# API: Python wrapper for the GENSCAN web server

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

### GenscanOutput class

In [2]:
class GenscanOutput:
    """Plain container bundling the values returned by ``run_genscan``.

    Attributes
    ----------
    status
        Stored as given (``run_genscan`` passes the HTTP status code).
    cds_list
        Stored as given (``run_genscan`` passes the list of peptide strings).
    intron_list
        Stored as given (``run_genscan`` passes the intron DataFrame).
    exon_list
        Stored as given (``run_genscan`` passes the exon DataFrame).
    """

    def __init__(self, status, cds_list, intron_list, exon_list):
        # No validation or copying: the four arguments are kept verbatim.
        self.status, self.cds_list = status, cds_list
        self.intron_list, self.exon_list = intron_list, exon_list

### run_genscan function

In [3]:
# Patterns used to pull the individual sections out of the plain-text GENSCAN report.
# Peptide: everything after the "..._aa" FASTA header line, up to the next non-residue char.
_GENSCAN_PEPTIDE_RE = re.compile(r'aa\n[A-Y\n]+')
# Exon row: "<gene>.<exon> <type> <strand> <begin> <end> ...", anchored at the start of a
# line so numbers elsewhere in a row (e.g. a probability of 1.000) can never be mistaken
# for a gene/exon label. Only real exon types are accepted; promoter (Prom) and poly-A
# (PlyA) signal rows are not exons and are skipped.
_GENSCAN_EXON_RE = re.compile(
    r'^[ \t]*\d+\.\d+[ \t]+(?:Init|Intr|Term|Sngl)[ \t]+[+-][ \t]+(\d+)[ \t]+(\d+)',
    re.MULTILINE,
)
# Sequence length from the header line "Sequence <name> : <N> bp : ...".
_GENSCAN_LENGTH_RE = re.compile(r':\s*(\d+)\s*bp\b')


def _parse_genscan_report(report):
    """Extract peptides, exon coordinates and non-exon intervals from a GENSCAN text report.

    Returns a ``(peptides, exons, introns)`` tuple: a list of peptide strings and two
    DataFrames with the columns ``exon start``/``exon end`` and
    ``intron start``/``intron end``.

    Raises ``ValueError`` when the sequence length cannot be found in the report.
    """
    # Drop the leading "aa" of each match and join the wrapped lines into one string.
    peptides = [hit[2:].replace('\n', '') for hit in _GENSCAN_PEPTIDE_RE.findall(report)]

    exon_start, exon_end = [], []
    for begin, end in _GENSCAN_EXON_RE.findall(report):
        exon_start.append(int(begin))
        exon_end.append(int(end))

    exons = pd.DataFrame(
        {'exon start': exon_start, 'exon end': exon_end},
        columns=['exon start', 'exon end'],
    )

    length_match = _GENSCAN_LENGTH_RE.search(report)
    if length_match is None:
        raise ValueError("could not find the sequence length ('<N> bp') in the GENSCAN report")
    length = int(length_match.group(1))

    # The "introns" are the gaps before, between and after the exons in report order.
    # The first interval starts at 0 and the last one ends at the sequence length.
    # NOTE(review): coordinates are used exactly as reported; if GENSCAN lists
    # Begin > End for '-' strand features these intervals will need normalising - confirm.
    intron_start = [0] + [end + 1 for end in exon_end]
    intron_end = [start - 1 for start in exon_start] + [length]

    introns = pd.DataFrame(
        {'intron start': intron_start, 'intron end': intron_end},
        columns=['intron start', 'intron end'],
    )

    return peptides, exons, introns


def run_genscan(sequence=None, sequence_file=None, organism="Vertebrate",
                exon_cutoff=1.00, sequence_name="",
                print_options='Predicted peptides only'):
    """Submit a sequence to the GENSCAN web form and parse the returned report.

    Parameters
    ----------
    sequence : str, optional
        Sequence text, sent as the ``-s`` form field. Ignored when
        ``sequence_file`` is given.
    sequence_file : str, optional
        Path of a file to upload as the ``-u`` form field. Takes precedence
        over ``sequence``.
    organism, exon_cutoff, sequence_name, print_options
        Sent unchanged as the ``-o``, ``-e``, ``-n`` and ``-p`` form fields.

    Returns
    -------
    GenscanOutput
        ``status`` is the HTTP status code, ``cds_list`` the list of predicted
        peptide strings, ``exon_list`` / ``intron_list`` the DataFrames built
        from the exon rows of the report (all predicted genes).

    Raises
    ------
    ValueError
        If neither ``sequence`` nor ``sequence_file`` is supplied, or if the
        response does not contain a parsable GENSCAN report.
    """
    if sequence is None and sequence_file is None:
        raise ValueError("run_genscan needs either `sequence` or `sequence_file`")

    form_url = 'http://hollywood.mit.edu/cgi-bin/genscanw_py.cgi'

    payload = {
        '-o': organism,
        '-e': exon_cutoff,
        '-n': sequence_name,
        '-p': print_options,
    }

    if sequence_file is not None:
        # The context manager closes the handle even when the request raises.
        with open(sequence_file, "rb") as handle:
            resp = requests.post(form_url, data=payload, files={'-u': handle})
    else:
        payload['-s'] = sequence
        resp = requests.post(form_url, data=payload)

    status = resp.status_code

    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed.
    pre = BeautifulSoup(resp.content, 'html.parser').find('pre')
    if pre is None:
        raise ValueError(
            "GENSCAN response contains no <pre> report block (HTTP status {})".format(status)
        )

    peptides, exons, introns = _parse_genscan_report(str(pre.text))

    return GenscanOutput(status=status, cds_list=peptides, intron_list=introns, exon_list=exons)

### Testing using test data

In [4]:
# Submit the TP53 test sequence from ./test_data to GENSCAN and keep the parsed result.
test = run_genscan(
    sequence=None,
    sequence_file='./test_data/TP53.fna',
    organism="Vertebrate",
    exon_cutoff=0.80,
    sequence_name="",
    print_options='Predicted peptides only',
)

In [5]:
# HTTP status code of the response received for the test request.
test.status

200

In [6]:
# Peptide sequences extracted from the GENSCAN report, one string per match.
test.cds_list

['XSQTAFRVTAMEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD']

In [7]:
# DataFrame of the parsed exon coordinates ('exon start' / 'exon end' columns).
test.exon_list

Unnamed: 0,exon start,exon end
0,10909,11010
1,11128,11149
2,11259,11537
3,12295,12478
4,12560,12672
5,13241,13350
6,13694,13830
7,13923,13996
8,16816,16922
9,17841,17922


In [8]:
# DataFrame of the gaps before, between and after the listed exons
# ('intron start' / 'intron end' columns).
test.intron_list

Unnamed: 0,intron start,intron end
0,0,10908
1,11011,11127
2,11150,11258
3,11538,12294
4,12479,12559
5,12673,13240
6,13351,13693
7,13831,13922
8,13997,16815
9,16923,17840
