## Human Reference Genoma 

In [1]:
import pandas as pd
import pickle
from Bio import SeqIO

Load dataset

In [2]:
def load_data_from_pickle(path):
    # Load the pickle file
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data

In [3]:
FILE_PATH = 'human_genoma/human_genoma_rna.pkl'
PROTEIN_FILE_PATH = 'human_genoma/human_genoma_protein.pkl'

human_genoma_df = load_data_from_pickle(FILE_PATH)
protein_df = load_data_from_pickle(PROTEIN_FILE_PATH)

In [4]:
human_genoma_df.head(2)

Unnamed: 0,ID,sequence
0,NR_168385.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
1,NR_168384.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...


Nucleotides allocation of symbols:
- Guanine. adenine, thymine, cytosine: G,A,T,C
- Purine (adenine or guanine): R
- Pyrimidine (thymine or cytosine): Y
- Adenine or thymine: W
- Guanine or cytosine: S
- Adenine or cytosine: M
- Guanine or thymine: K
- Adenine or thymine or cytosine: H
- Guanine or cytosine or thymine: B
- Guanine or adenine or cytosine: V
- Guanine or adenine or thymine: D
- Guanine or adenine or thymine or cytosine: N

Explore sequences category:

In [5]:
human_genoma_df['category'] = human_genoma_df['ID'].str.split('_', expand=True)[0]
human_genoma_df.head(2)

Unnamed: 0,ID,sequence,category
0,NR_168385.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR
1,NR_168384.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR


In [6]:
human_genoma_df['category'].value_counts()

XM    131642
NM     53668
XR     51976
NR     23224
Name: category, dtype: int64

In [7]:
protein_df['category'] = protein_df['ID'].str.split('_', expand=True)[0]
protein_df['category'].value_counts()

XP    131642
NP     53668
YP        13
Name: category, dtype: int64

| Category | Description                                    | Comment                                          |
|----------|------------------------------------------------|--------------------------------------------------|
| NC       | Complete genomic molecules                     |                                                  |
| NG       | Incomplete genomic region                      |                                                  |
| NM       | mRNA                                           | Protein-coding transcripts (usually curated)      |
| NR       | ncRNA                                          | Non-protein-coding transcripts                    |
| XM       | Predicted mRNA model                           | Predicted model protein-coding transcript         |
| XR       | Predicted ncRNA model                          | Predicted model non-protein-coding transcript     |
| NP       | Protein                                        | Associated with an NM_ or NC_ accession           |
| YP       | Protein	                                    | Annotated on genomic molecules without an instantiated transcript record|
| XP       | Predicted Protein model (eukaryotic sequences) | Associated with an XM_ accession                  |
| WP       | Predicted Protein model (prokaryotic sequences)| Non-redundant across multiple strains and species |

In [8]:
categories = list(set(human_genoma_df['category'].unique()))+list(set(protein_df['category'].unique()))
categories

['XR', 'XM', 'NM', 'NR', 'YP', 'NP', 'XP']

In [9]:
columns = ['ID']
for category in categories:
    columns.append(category+'_ID')
    columns.append(category+'_sequence')

final_df = pd.DataFrame(columns=columns)

In [18]:
human_genoma_df.shape

(260510, 3)

Check if we have same sample for different category in dataset:

In [16]:
def id(row):
    return row['ID'].split('_')[1]

id_number_list = human_genoma_df.apply(id, axis=1)

In [19]:
human_genoma_df.shape[0] - id_number_list.nunique()

67

In [20]:
id_protein_list = protein_df.apply(id, axis=1)
protein_df.shape[0] - id_protein_list.nunique() # no

0

In [23]:
# check if the ID in the human_genoma_df is in the protein_df

for id in id_number_list:
    if id in id_protein_list:
        print(id) # no

Save dataset with only protein-coding data

In [25]:
human_genoma_df

Unnamed: 0,ID,sequence,category
0,NR_168385.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR
1,NR_168384.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...,NR
2,NR_168380.1,AGTCCCAGGGAGGAGACCGCGGGAGAGGCGGCGGGACCAGGGTCCC...,NR
3,NR_168400.1,GCACACCTGGCTCACGGCGAGTGCGGAGCAGAAAGCACTACTGGCG...,NR
4,NR_028389.1,GCTACACTTAGTGACTCTGAGGGACATGCAACCCTCCCCGCATGCT...,NR
...,...,...,...
63840,NM_001363973.3,AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...,NM
63841,NM_001363974.2,AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...,NM
63842,NR_157026.1,CGTTGCCTTGGCTACACCGTCTGTTAGGGCCGCGCACGAGATCAGT...,NR
63843,NR_024605.3,AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...,NR


In [26]:
# potein-coding category: NM XM
protein_coding_df = human_genoma_df[[human_genoma_df['category'] == 'NM' | human_genoma_df['category'] == 'XM']]
protein_coding_df.shape

TypeError: Cannot perform 'ror_' with a dtyped [object] array and scalar of type [bool]