In [None]:
import csv

In [None]:
# Input files, given relative to the notebook's working directory.
# Genomic FASTA (GRCh38.p14 per the file name).
sequence_path = "../data/GRC38/GCF_000001405.40_GRCh38.p14_genomic.fna"
# Tab-separated gene annotation table.
annotation_path = "../data/GRC38_ANNOTATIONS/gene_details.tsv"
# Protein FASTA.
protein_path = "../data/GRC38/protein.faa"

## Create a dictionary mapping gene symbol to accession (sequence record), start and end positions, orientation, and protein accession.

In [None]:
# Gene symbol -> (accession, begin, end, orientation, protein accession),
# filled in from the annotation TSV.
gene_details = {}

In [None]:
# The csv module expects its file to be opened with newline="" so that line
# endings inside quoted fields are handled by the parser rather than by the
# file object; the encoding is pinned so the result does not depend on the
# platform's default locale.
with open(annotation_path, newline="", encoding="utf-8") as annotation_file:
    reader = csv.DictReader(annotation_file, delimiter="\t", quotechar='"')
    for row in reader:
        # Values are kept as the strings csv yields (Begin/End are not
        # converted here). A repeated Symbol overwrites the earlier row.
        gene_details[row["Symbol"]] = (
            row["Accession"],
            row["Begin"],
            row["End"],
            row["Orientation"],
            row["Protein accession"],
        )

In [None]:
# Spot-check a single entry to confirm the annotation table loaded.
gene_details["A1BG"]

## Create a dictionary mapping accession to sequence

In [None]:
from Bio import SeqIO

In [None]:
# FASTA record id -> sequence, for the records of the genomic FASTA.
accession_to_sequence = {}

In [None]:
# list() reads every record of the genomic FASTA into memory up front.
records = list(SeqIO.parse(sequence_path, "fasta"))

In [None]:
# Index each parsed record's sequence by its FASTA identifier; a repeated
# identifier keeps the last sequence seen.
accession_to_sequence.update(
    (genomic_record.id, genomic_record.seq) for genomic_record in records
)

## Create a dictionary mapping protein accession to protein (amino-acid) sequence

In [None]:
# FASTA record id -> sequence, for the records of the protein FASTA.
protein_accession_to_sequence = {}

In [None]:
# list() reads every record of the protein FASTA into memory up front.
protein_records = list(SeqIO.parse(protein_path, "fasta"))

In [None]:
# Index each parsed protein record's sequence by its FASTA identifier; a
# repeated identifier keeps the last sequence seen.
protein_accession_to_sequence.update(
    (protein_record.id, protein_record.seq) for protein_record in protein_records
)

## Function that maps a gene to sequence

In [None]:
# Note - RefSeq soft-masks repeat and low-complexity regions by converting them to lower case.
# That sequence is still valuable, so the masking is ignored by upper-casing the result.
def get_sequence(gene_symbol, strand_aware=False):
    """Return the upper-cased genomic sequence spanning a gene.

    The gene is looked up in ``gene_details``; its ``Begin``..``End`` range is
    sliced out of the matching entry of ``accession_to_sequence`` and
    upper-cased.

    Args:
        gene_symbol: Key into ``gene_details``.
        strand_aware: When False (the default) the slice is returned exactly
            as stored for the accession, whatever the gene's orientation.
            When True, a gene whose orientation is "minus" is returned
            reverse-complemented; this calls ``reverse_complement()`` on the
            stored sequence object.

    Returns:
        The sliced, upper-cased sequence.

    Raises:
        KeyError: If ``gene_symbol`` is not in ``gene_details`` or its
            accession is not in ``accession_to_sequence``.
        ValueError: If the stored begin/end values cannot be parsed as integers.
    """
    accession, start, end, orientation, _ = gene_details[gene_symbol]
    # Coordinates are 1-indexed with an inclusive end; Python slices are
    # 0-indexed with an exclusive end, so only the start needs shifting.
    region = accession_to_sequence[accession][int(start) - 1:int(end)].upper()
    # NOTE(review): assumes the Orientation column marks reverse-strand genes
    # with the value "minus" — confirm against the annotation TSV.
    if strand_aware and (orientation or "").strip().lower() == "minus":
        region = region.reverse_complement()
    return region

## Function that maps a gene to a protein sequence

In [None]:
def get_protein_sequence(gene_symbol):
    """Return the protein sequence recorded for a gene.

    The gene's protein accession is read from ``gene_details`` and used as
    the key into ``protein_accession_to_sequence``. The stored sequence is
    returned unmodified.

    Args:
        gene_symbol: Key into ``gene_details``.

    Returns:
        The entry of ``protein_accession_to_sequence`` for the gene's protein
        accession.

    Raises:
        KeyError: If ``gene_symbol`` is not in ``gene_details``, or if the
            gene's protein accession (possibly empty) has no entry in
            ``protein_accession_to_sequence``.
    """
    # Only the protein accession (the last field) is needed; the genomic
    # coordinates and orientation play no part in this lookup.
    protein_accession = gene_details[gene_symbol][-1]
    try:
        return protein_accession_to_sequence[protein_accession]
    except KeyError:
        # Re-raise with the gene and accession named, so an empty or unknown
        # accession is distinguishable from a bare KeyError('').
        raise KeyError(
            "No protein sequence for gene {!r} (protein accession: {!r})".format(
                gene_symbol, protein_accession
            )
        ) from None

In [None]:
# Smoke-test the protein lookup on a single gene.
print(get_protein_sequence("A1BG"))