## Human Reference Genome 

In [2]:
import pandas as pd
import pickle
from Bio import SeqIO

Load dataset

In [3]:
def load_data_from_pickle(path):
    # Load the pickle file
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data

In [4]:
FILE_PATH = 'human_genome/human_genome_rna.pkl'
PROTEIN_FILE_PATH = 'human_genome/human_genome_protein.pkl'

human_genome_df = load_data_from_pickle(FILE_PATH)
protein_df = load_data_from_pickle(PROTEIN_FILE_PATH)

In [5]:
human_genome_df.head(2)

Unnamed: 0,ID,sequence
0,NR_168385.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
1,NR_168384.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...


Nucleotides allocation of symbols:
- Guanine. adenine, thymine, cytosine: G,A,T,C
- Purine (adenine or guanine): R
- Pyrimidine (thymine or cytosine): Y
- Adenine or thymine: W
- Guanine or cytosine: S
- Adenine or cytosine: M
- Guanine or thymine: K
- Adenine or thymine or cytosine: H
- Guanine or cytosine or thymine: B
- Guanine or adenine or cytosine: V
- Guanine or adenine or thymine: D
- Guanine or adenine or thymine or cytosine: N

Explore sequences category:

In [6]:
human_genome_df['category'] = human_genome_df['ID'].str.split('_', expand=True)[0]
human_genome_df.head(2)

Unnamed: 0,ID,sequence,category
0,NR_168385.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR
1,NR_168384.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR


In [7]:
human_genome_df['category'].value_counts()

XM    131642
NM     53668
XR     51976
NR     23224
Name: category, dtype: int64

In [8]:
protein_df['category'] = protein_df['ID'].str.split('_', expand=True)[0]
protein_df['category'].value_counts()

XP    131642
NP     53668
YP        13
Name: category, dtype: int64

| Category | Description                                    | Comment                                          |
|----------|------------------------------------------------|--------------------------------------------------|
| NC       | Complete genomic molecules                     |                                                  |
| NG       | Incomplete genomic region                      |                                                  |
| NM       | mRNA                                           | Protein-coding transcripts (usually curated)      |
| NR       | ncRNA                                          | Non-protein-coding transcripts                    |
| XM       | Predicted mRNA model                           | Predicted model protein-coding transcript         |
| XR       | Predicted ncRNA model                          | Predicted model non-protein-coding transcript     |
| NP       | Protein                                        | Associated with an NM_ or NC_ accession           |
| YP       | Protein	                                    | Annotated on genomic molecules without an instantiated transcript record|
| XP       | Predicted Protein model (eukaryotic sequences) | Associated with an XM_ accession                  |
| WP       | Predicted Protein model (prokaryotic sequences)| Non-redundant across multiple strains and species |

In [9]:
categories = list(set(human_genome_df['category'].unique()))+list(set(protein_df['category'].unique()))
categories

['NM', 'NR', 'XR', 'XM', 'XP', 'YP', 'NP']

In [10]:
columns = ['ID']
for category in categories:
    columns.append(category+'_ID')
    columns.append(category+'_sequence')

final_df = pd.DataFrame(columns=columns)

In [11]:
human_genome_df.shape

(260510, 3)

Check if we have same sample for different category in dataset:

In [12]:
def id(row):
    return row['ID'].split('_')[1]

id_number_list = human_genome_df.apply(id, axis=1)

In [13]:
human_genome_df.shape[0] - id_number_list.nunique()

67

In [14]:
id_protein_list = protein_df.apply(id, axis=1)
protein_df.shape[0] - id_protein_list.nunique() # no

0

In [15]:
# check if the ID in the human_genome_df is in the protein_df

for id in id_number_list:
    if id in id_protein_list:
        print(id) # no

Save dataset with only protein-coding data

In [16]:
human_genome_df

Unnamed: 0,ID,sequence,category
0,NR_168385.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR
1,NR_168384.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR
2,NR_168380.1,AGTCCCAGGGAGGAGACCGCGGGAGAGGCGGCGGGACCAGGGTCCC...,NR
3,NR_168400.1,GCACACCTGGCTCACGGCGAGTGCGGAGCAGAAAGCACTACTGGCG...,NR
4,NR_028389.1,GCTACACTTAGTGACTCTGAGGGACATGCAACCCTCCCCGCATGCT...,NR
...,...,...,...
63840,NM_001363973.3,AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...,NM
63841,NM_001363974.2,AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...,NM
63842,NR_157026.1,CGTTGCCTTGGCTACACCGTCTGTTAGGGCCGCGCACGAGATCAGT...,NR
63843,NR_024605.3,AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...,NR


In [20]:
# potein-coding category: NM XM
coding_sequences_df = human_genome_df[human_genome_df['category'].str.contains('NM') | human_genome_df['category'].str.contains('XM')]
coding_sequences_df.shape

(185310, 3)

In [26]:
coding_sequences_df.index = range(coding_sequences_df.shape[0]) # reset index
coding_sequences_df.head(2)

Unnamed: 0,ID,sequence,category
0,NM_001368254.1,GCTGAGCTGAGCTGGGGCGCAGCCGCCTGTCTGCACCGGCAGCACC...,NM
1,NM_001350977.1,TCCACAACTGAAACATCCACTTCTGAACACCATGTCCTACTACAGC...,NM


In [22]:
coding_sequences_df[coding_sequences_df['category'].str.contains('XM')].sample(5)

Unnamed: 0,ID,sequence,category
26162,XM_054365963.1,AGAAGGTCAGCAAAGGAAAGTGGAAGTTGGATTCTGAAAGATCGAG...,XM
10544,XM_011526192.2,GAGCCTGCGAGTCCGCGAGCCAGCGAGCTGCGGCTGCGGCCTCCCC...,XM
20328,XM_054337381.1,CCCCATTTGTGACAGTCAGGTGTGTAGCTGGGACGGTGCTGGTCTG...,XM
12427,XM_005266279.5,GACCTAGCGTGTGCTCAGCTCTGGACAAGACATGGATGTAGGCAGA...,XM
862,XM_011532566.3,TGACAGCTCATACCCTGGAGGAGGGCCACTATGTCATCGGGCCCAA...,XM


In [23]:
coding_sequences_df[coding_sequences_df['category'].str.contains('NM')].sample(5)

Unnamed: 0,ID,sequence,category
4348,NM_001164692.3,ATTCTTGTCTTACCCTCTGCAAATGTGATAGGCACAGGACAGGAGT...,NM
4329,NM_001286109.2,ACGTGATCCGACAAACGGCCTCTGCATAGTGCAGAACATTCTGCTG...,NM
4659,NM_002028.4,AATGCGCGTTGTTGCTTAACGAAGCAGAGTCCTACACACTGTCTGC...,NM
6509,NM_001394498.1,GCCATCTTTGTTGGGGGCAGCCAGGCCTGGCTCGAGATGCCGAAGT...,NM
661,NM_001318976.1,ATCTCGCGTTTCCGGCCGGAAGCTTCTCCAGCCTTTCCCGGAAGCT...,NM


Save dataset:

In [27]:
PATH = 'human_genome/'

coding_sequences_df.to_csv(PATH+'human_genome_protein_coding_seq.csv', index=False)
coding_sequences_df.to_pickle(PATH+'human_genome_protein_coding_seq.pkl')