## Data Access:

In [1]:

import pandas as pd
from Bio import Entrez, SeqIO

# Provide your email address to NCBI. This is set once at module level,
# before any of the Entrez.efetch calls made further down.
Entrez.email = "hzkhan@ucdavis.edu"


def fetch_sequence(accession):
    """Fetch a single nucleotide record from NCBI in FASTA format.

    Args:
        accession: Accession number, passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The record that ``SeqIO.read`` parses from the FASTA response.

    Raises:
        Any exception raised by ``Entrez.efetch`` or ``SeqIO.read`` is
        propagated unchanged; when parsing fails the handle is closed first.
    """
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta")
    try:
        # Parse inside try/finally so the handle is released even when the
        # response cannot be parsed.
        return SeqIO.read(handle, "fasta")
    finally:
        handle.close()

# (accession number, display name) pairs, written side by side so each
# accession number sits next to the label it is paired with
_ACCESSION_NAME_PAIRS = [
    ('NC_011137', 'European Neanderthal'),
    ('KC345764', 'South African Human'),
    ('FN673705', 'Altai Denisovan'),
    ('NC_002083', 'Sumatran Orangutan'),
    ('NC_001646', 'Bornean Orangutan'),
    ('NC_011120', 'Western Lowland Gorilla'),
    ('KM242275', 'Eastern Lowland Gorilla'),
    ('NC_001644', 'Bonobo'),
    ('JF727201', 'Chimp Schweinfurthii'),
    ('KM679417', 'Chimp Ellioti'),
    ('JF727217', 'Chimp Verus'),
    ('JF727180', 'Chimp Troglodytes'),
    ('NC_033885', 'Hoolock Gibbon'),
    ('NC_014047', 'Symphalangus Gibbon'),
    ('NC_014051', 'Nomascus Gibbon'),
    ('NC_014042', 'Hylobates Gibbon'),
]

# List of accession numbers
accession_numbers = [pair[0] for pair in _ACCESSION_NAME_PAIRS]

# List of names (same order as accession_numbers)
names = [pair[1] for pair in _ACCESSION_NAME_PAIRS]

def fetch_sequences(accession_numbers, names):
    '''
    Fetch one GenBank record per accession number and collect its organism and sequence.

    Input: a list of accession numbers needed to access the sequence data from the API
           and a list of the species names, in the same order and of the same length
    Output: a dictionary keyed by accession number; each value is the list
            [scientific_name, name, sequence], where scientific_name is the record's
            'organism' annotation (None when the record has none), name is the matching
            entry of `names`, and sequence is the record's sequence as a string.
            A repeated accession number keeps only its last entry.
    Raises: ValueError if the two inputs differ in length. Errors from Entrez.efetch
            or SeqIO.read are propagated unchanged.
    '''
    # Materialise the inputs so their lengths can be compared, even for iterators
    accession_numbers = list(accession_numbers)
    names = list(names)
    if len(accession_numbers) != len(names):
        # zip() would silently drop the unmatched tail, so fail loudly instead
        raise ValueError(
            f"accession_numbers and names must have the same length "
            f"(got {len(accession_numbers)} and {len(names)})"
        )

    # Go through each accession number, get the data, and add it to the dictionary
    species_dna_data = {}
    for accession, name in zip(accession_numbers, names):
        # Fetch the GenBank file from NCBI using the accession number and read the data
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode='text')
        try:
            record = SeqIO.read(handle, "gb")
        finally:
            # Release the handle even when the response cannot be parsed
            handle.close()

        # Extract the scientific name and the DNA sequence from the GenBank record
        scientific_name = record.annotations.get('organism', None)
        sequence = str(record.seq)

        # Add the species to the dictionary with its scientific name, name, and DNA sequence
        species_dna_data[accession] = [scientific_name, name, sequence]
    return species_dna_data




# Download every record, keyed by accession number
species_dna_data = fetch_sequences(accession_numbers, names)

# Turn the {accession: [scientific_name, name, sequence]} mapping into a table:
# one row per key, the keys moved out of the index into the first column,
# then all four columns labelled in one step
species_dna_df = (
    pd.DataFrame.from_dict(species_dna_data, orient='index')
    .reset_index()
    .set_axis(['accession_number', 'scientific_name', 'name', 'mt_dna'], axis=1)
)


species_dna_df

Unnamed: 0,accession_number,scientific_name,name,mt_dna
0,NC_011137,Homo sapiens neanderthalensis,European Neanderthal,GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCAT...
1,KC345764,Homo sapiens,South African Human,GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCAT...
2,FN673705,Homo sapiens subsp. 'Denisova',Altai Denisovan,GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCAT...
3,NC_002083,Pongo abelii,Sumatran Orangutan,GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTC...
4,NC_001646,Pongo pygmaeus,Bornean Orangutan,GTTTATGTAGCTTATTCCATCCAAAGCAATACACTGAAAATGTCTC...
5,NC_011120,Gorilla gorilla gorilla,Western Lowland Gorilla,GTTTATGTAGCTTACCTCCCCAAAGCAATACACTGAAAATGTTTCG...
6,KM242275,Gorilla beringei graueri,Eastern Lowland Gorilla,GTTTATGTAGCTTACCTCCCCAAAGCAATACACTGAAAATGTTTCG...
7,NC_001644,Pan paniscus,Bonobo,GTTTATGTAGCTTACCCCCTTAAAGCAATACACTGAAAATGTTTCG...
8,JF727201,Pan troglodytes schweinfurthii,Chimp Schweinfurthii,GTTTATGTAGCTTACCCCCTCAAAGCAATACACTGAAAATGTTTCG...
9,KM679417,Pan troglodytes ellioti,Chimp Ellioti,GTTTATGTAGCTTACCCCCTCAAAGCAATACACTGAAAATGTTTCG...


## Database Setup:

In [2]:

import sqlite3 as sql

path = "data.sqlite"

# Everything that touches the database runs inside try/finally so the
# connection is closed even if one of the statements below raises.
conn = sql.connect(path)
try:
    cur = conn.cursor()

    ### convert df to db
    # if_exists="replace" rebuilds the table from the DataFrame, so the
    # ALTER TABLE statements below start from a table without the extra columns
    species_dna_df.to_sql("MtDNA", conn, if_exists="replace", index=False)

    ### Add sequence length column
    cur.execute('''ALTER TABLE MtDNA ADD COLUMN length INT;''')

    ### Populate sequence length column using mt_dna column
    cur.execute('''UPDATE MtDNA
                   SET length = LENGTH(mt_dna);''')

    ### Add category column
    cur.execute('''ALTER TABLE MtDNA ADD COLUMN category TEXT;''')

    ### Populate category column using the name & scientific_name columns
    # The lowercase patterns rely on SQLite's LIKE being case-insensitive for
    # ASCII letters by default (e.g. '%homo%' matches 'Homo sapiens').
    # The first matching WHEN wins; rows matching none are labelled 'other'.
    cur.execute('''UPDATE MtDNA
                   SET category =
                        CASE
                            WHEN scientific_name LIKE '%homo%' THEN 'Human'
                            WHEN scientific_name LIKE '%pan%' THEN 'Chimp'
                            WHEN scientific_name LIKE '%gorilla%' THEN 'Gorilla'
                            WHEN scientific_name LIKE '%pongo%' THEN 'Orangutan'
                            WHEN name LIKE '%gibbon%' THEN 'Gibbon'
                            ELSE 'other'
                        END;''')

    # Persist the updates, then read the finished table back for display
    conn.commit()

    species_dna_db = pd.read_sql_query('SELECT * FROM MtDNA', conn)
finally:
    conn.close()



species_dna_db

Unnamed: 0,accession_number,scientific_name,name,mt_dna,length,category
0,NC_011137,Homo sapiens neanderthalensis,European Neanderthal,GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCAT...,16565,Human
1,KC345764,Homo sapiens,South African Human,GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCAT...,16567,Human
2,FN673705,Homo sapiens subsp. 'Denisova',Altai Denisovan,GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCAT...,16570,Human
3,NC_002083,Pongo abelii,Sumatran Orangutan,GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTC...,16499,Orangutan
4,NC_001646,Pongo pygmaeus,Bornean Orangutan,GTTTATGTAGCTTATTCCATCCAAAGCAATACACTGAAAATGTCTC...,16389,Orangutan
5,NC_011120,Gorilla gorilla gorilla,Western Lowland Gorilla,GTTTATGTAGCTTACCTCCCCAAAGCAATACACTGAAAATGTTTCG...,16412,Gorilla
6,KM242275,Gorilla beringei graueri,Eastern Lowland Gorilla,GTTTATGTAGCTTACCTCCCCAAAGCAATACACTGAAAATGTTTCG...,16416,Gorilla
7,NC_001644,Pan paniscus,Bonobo,GTTTATGTAGCTTACCCCCTTAAAGCAATACACTGAAAATGTTTCG...,16563,Chimp
8,JF727201,Pan troglodytes schweinfurthii,Chimp Schweinfurthii,GTTTATGTAGCTTACCCCCTCAAAGCAATACACTGAAAATGTTTCG...,16560,Chimp
9,KM679417,Pan troglodytes ellioti,Chimp Ellioti,GTTTATGTAGCTTACCCCCTCAAAGCAATACACTGAAAATGTTTCG...,16559,Chimp
