In [2]:
import re
from collections import OrderedDict
import matplotlib.pyplot as plt

### read genome into a dictionary key=scaffold, value=sequence
genome = OrderedDict()

with open('scaffold.fa', 'rt') as f:
    scfd = None     # name of the record being read; None until the first header line
    parts = []      # upper-cased sequence lines collected for the current record
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            # a new header closes the previous record (if there was one)
            if scfd is not None:
                genome[scfd] = ''.join(parts)
            # the key is the first whitespace-delimited header token with '>' removed
            scfd = re.sub(r'>', r'', line.split()[0])
            parts = []
        else:
            parts.append(line.upper())
    if scfd is not None:
        genome[scfd] = ''.join(parts)

# a header consisting of '>' alone would produce an empty key; discard it
genome.pop('', None)

# Sections 1a and 1c below are disabled by wrapping them in a string literal.
# The open()/header lines for the 1a report live inside the disabled text as well:
# when they were live code, every run of this cell truncated
# output/chromsome_information.txt and left the handle open, because the only
# fh.close() is in the disabled part.
'''
### 1a. GC content analysis on each scaffold
fh = open('output/chromsome_information.txt', 'wt')
fh.write('Name\tTotal_Length\tEffective_length\tN_length\tGC_length\tGC_rate(%)\n')

for k, v in genome.items():
    total_len = len(v)
    eff_len = 0
    n = 0
    at = 0
    gc = 0

    for nt in v:
        if nt == 'A' or nt == 'T':
            at += 1
            eff_len += 1
        elif nt == 'G' or nt == 'C':
            gc += 1
            eff_len += 1
        elif nt == 'N':
            n += 1

    gc_rate = round(gc/(gc+at)*100, 2)

    fh.write(k+'\t'+str(total_len)+'\t'+str(eff_len)+'\t'+str(n)+'\t'+str(gc)+'\t'+str(gc_rate)+'\n')

fh.close()


### 1c. analyze the GC content in every 250bp non-overlapping sliding window
fh = open('output/gc_distribution.txt', 'wt')
fh.write('Scaffold\tStart\tEnd\tGC_num\tGC_content\n')

for k, v in genome.items():
    seq = re.sub(r'N+', r'', v)
    for i in range(250, len(seq), 250):
        fh.write(k+'\t')
        start = i-250+1
        end = i
        fh.write(str(start)+'\t'+str(end)+'\t')
        window = seq[i-250:i]
        gc = 0
        at = 0
        for j in range(250):
            if window[j] == 'G' or window[j] == 'C':
                gc += 1
            elif window[j] == 'A' or window[j] == 'T':
                at += 1

        gc_rate = round(gc/(gc+at)*100, 2)
        fh.write(str(gc)+'\t'+str(gc_rate)+'\n')

    if len(seq)-i > 200 and len(seq)-i < 250:
        fh.write(k+'\t')
        start = i+1
        end = len(seq)
        fh.write(str(start)+'\t'+str(end)+'\t')
        window = seq[i:end]
        gc = 0
        at = 0
        for j in range(end-i):
            if window[j] == 'G' or window[j] == 'C':
                gc += 1
            elif window[j] == 'A' or window[j] == 'T':
                at += 1
        gc_rate = round(gc/(gc+at)*100, 2)
        fh.write(str(gc)+'\t'+str(gc_rate)+'\n')

fh.close()
'''

### plot GC content curve
def gc_content(scaffold = 'scaffold16', path = 'output/gc_distribution.txt'):
    """Return the GC-content series of one scaffold from the per-window table.

    Parameters
    ----------
    scaffold : str
        Scaffold name. A row is used only when its first column equals this
        name exactly.
    path : str
        Tab-separated table to read. Defaults to the file the notebook has
        always used, so existing calls behave as before.

    Returns
    -------
    tuple of (list of int, list of float)
        Window end coordinates (third column) and GC percentages (fifth
        column), in file order. Both lists are empty when no row matches.

    Raises
    ------
    ValueError
        If a matching row holds a non-numeric end coordinate or GC value.
    """
    x = []
    y = []
    with open(path, 'rt') as f:
        for line in f:
            # Header rows: the original 'Chr' check is kept, and the header that
            # section 1c writes ('Scaffold<TAB>Start...') is recognised too.
            if line.startswith('Chr') or line.startswith('Scaffold\tStart'):
                continue
            lst = line.rstrip().split('\t')
            # Exact comparison: a prefix test on the line would also collect
            # rows of e.g. scaffold160 when scaffold16 is requested.
            if lst[0] != scaffold:
                continue
            x.append(int(lst[2]))
            y.append(float(lst[4]))
    return x, y

position, gc = gc_content(scaffold = 'scaffold16')
# The x-axis limit uses the N-stripped length, the same transformation the
# (disabled) section 1c applies before cutting windows.
seq = re.sub(r'N+', r'', genome['scaffold16'])

plt.plot(position,gc)
plt.xlim(250,len(seq))
plt.ylim(0, 100)
# Tick labels now match the tick positions (250000 is 250K, not 25K).
# NOTE(review): the fixed ticks assume the scaffold is longer than 750 kb - confirm.
plt.xticks([250, 250000, 500000, 750000, len(seq)], ['Start', '250K', '500K', '750K', 'End'])
plt.xlabel('Scaffold16')
plt.ylabel('GC Content (%)')
plt.savefig('output/Scaffold16 GC content curve')
plt.show()

# Disabled code: the CDS-retrieval step below is wrapped in a string literal, so
# none of it runs. Because the string is the last expression of the cell, the
# notebook echoes it back as the cell's output.
'''
### 1d. retrival of all cds sequence
cds = OrderedDict()
fh = open('output/cds_retrieval.fa', 'wt')
with open('scaffold.gff', 'rt') as f:
    for line in f:
        line = line.rstrip()
        lst = line.split()
        scfd = lst[0]
        start = int(lst[3])
        end = int(lst[4])
        if lst[2] == 'CDS':
            cds[line] = genome[scfd][start-1:end]
            
for k, v in cds.items():
    fh.write('>'+k+'\n')
    fh.write(v+'\n')
    
fh.close()
'''

<Figure size 640x480 with 1 Axes>

"\n### 1d. retrival of all cds sequence\ncds = OrderedDict()\nfh = open('output/cds_retrieval.fa', 'wt')\nwith open('scaffold.gff', 'rt') as f:\n    for line in f:\n        line = line.rstrip()\n        lst = line.split()\n        scfd = lst[0]\n        start = int(lst[3])\n        end = int(lst[4])\n        if lst[2] == 'CDS':\n            cds[line] = genome[scfd][start-1:end]\n            \nfor k, v in cds.items():\n    fh.write('>'+k+'\n')\n    fh.write(v+'\n')\n    \nfh.close()\n"

In [None]:
import subprocess as sp
from subprocess import call
from collections import OrderedDict, defaultdict

# cds maps "scaffold<TAB>gene<TAB>transcript" to the list of (start, end)
# coordinate strings of the CDS rows that follow that mRNA row.
cds = defaultdict(list)
geneid = None   # key built from the most recent mRNA row
with open('selectedGene.gff', 'rt') as f:
    for line in f:
        line = line.rstrip()
        lst = line.split()
        # blank lines and '#' comment lines carry no feature; indexing them
        # would raise IndexError
        if not lst or lst[0].startswith('#'):
            continue
        if lst[2] == 'mRNA':
            # gene and transcript names are read from the 10th
            # whitespace-separated token of the row
            mch1 = re.search(r'Gene_name=(.+);Transcript', lst[9])
            mch2 = re.search(r'Transcript=(.+);$', lst[9])
            if mch1 is None or mch2 is None:
                raise ValueError('cannot parse Gene_name/Transcript from mRNA row: ' + line)
            gene = mch1.group(1)
            transcript = mch2.group(1)
            geneid = lst[0]+'\t'+gene+'\t'+transcript
            cds[geneid] = []
        elif lst[2] == 'CDS':
            # without a preceding mRNA row there is no key to file the CDS under
            if geneid is None:
                raise ValueError('CDS row appears before any mRNA row: ' + line)
            cds[geneid].append((lst[3], lst[4]))
        
        
# Convert the sorted BAM file to SAM text with samtools (-h keeps the header).
bam2sam = ('samtools view -h -o inputData.sort.sam inputData.sort.bam')
print (bam2sam)
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# conversion stops here instead of letting the next step read a missing or
# stale SAM file. The command contains no shell syntax, so it is passed as an
# argument list rather than through a shell.
sp.check_call(bam2sam.split())

# For every SAM record, report each gene whose CDS intervals overlap the mate
# position (columns 7 and 8 of the record: mate reference name and mate start).
READ_LENGTH = 85   # offset added to the mate start to get its end, as in the original code

# Parse the CDS coordinates once and group them by scaffold, so each read is
# compared only against genes on its own scaffold. Order within a scaffold
# follows the order of `cds`, which keeps the output order unchanged.
cds_by_chrom = {}
for k, v in cds.items():
    chrk, genek, transk = k.split()
    intervals = [(min(int(a), int(b)), max(int(a), int(b))) for a, b in v]
    cds_by_chrom.setdefault(chrk, []).append((genek, intervals))

# both files are closed even if a malformed record raises part-way through
with open('read_cds_overlap.txt', 'wt') as fh, open('inputData.sort.sam', 'rt') as f:
    for line in f:
        # the SAM file is written with `samtools view -h`, so it starts with
        # '@' header lines that are not alignment records
        if line.startswith('@'):
            continue
        lst = line.split()
        if len(lst) < 8:
            continue
        if lst[6] == '*':
            # no mate reference recorded
            continue
        elif lst[6] == '=':
            # '=' means the mate is on the same reference as this read
            chrread2 = lst[2]
        else:
            chrread2 = lst[6]

        genes_here = cds_by_chrom.get(chrread2)
        if not genes_here:
            continue

        pos1 = int(lst[7])
        pos2 = pos1 + READ_LENGTH
        for genek, intervals in genes_here:
            # Two closed intervals overlap when each starts before the other ends.
            # Unlike testing only the two read end points, this also catches a
            # read that covers a whole (short) CDS.
            if any(pos1 <= end and pos2 >= start for start, end in intervals):
                fh.write(lst[0]+'\t'+genek+'\n')

In [6]:
import re
# Scratch check: remove every run of lowercase 'a' from a mixed-case string.
x = 'atacagtagaAGATA'
y = re.compile(r'a+').sub(r'', x)
print (y)

tcgtgAGATA


In [None]:
import matplotlib.pyplot as plt
# Collect the window end positions (x) and GC percentages (y) of scaffold16.
with open('output/gc_distribution.txt', 'rt') as f:
    x = []
    y = []
    for line in f:
        lst = line.rstrip().split('\t')
        # Exact match on the first column: a prefix test would also accept
        # scaffold160 etc., and the header row never equals 'scaffold16'.
        if lst[0] != 'scaffold16':
            continue
        x.append(int(lst[2]))
        y.append(float(lst[4]))

# Looked up once, after the loop. The original did this inside the loop via
# `lst`, which is not yet assigned in this cell when the header row is read.
# N runs are removed because the window coordinates produced by section 1c
# refer to the N-stripped sequence.
seq = re.sub(r'N+', r'', genome['scaffold16'])

plt.plot(x,y)
plt.xlim(250,len(seq))
plt.ylim(0, 100)
# tick labels match the tick positions (250000 is 250K, not 25K)
plt.xticks([250, 250000, 500000, 750000, len(seq)], ['Start', '250K', '500K', '750K', 'End'])
plt.xlabel('Scaffold16')
plt.ylabel('GC Content (%)')
plt.savefig('output/Scaffold16 GC content curve')
plt.show()

In [None]:
from collections import OrderedDict
# cds maps each CDS row of the GFF file (the full line) to the upper-cased
# genome slice it describes.
cds = OrderedDict()

with open('scaffold.gff', 'rt') as f:
    for line in f:
        line = line.rstrip()
        lst = line.split()
        # blank lines and '#' comment lines have no feature columns
        if not lst or lst[0].startswith('#'):
            continue
        if lst[2] != 'CDS':
            continue
        # coordinates are parsed for CDS rows only
        scfd = lst[0]
        start = int(lst[3])
        end = int(lst[4])
        # GFF coordinates are 1-based and inclusive, hence start-1
        # NOTE(review): the strand column is not read, so minus-strand features
        # are not reverse-complemented - confirm this is intended.
        cds[line] = genome[scfd][start-1:end].upper()

# The output file is opened only once the records are ready, and `with` closes
# it even if writing fails; a parsing error no longer leaves an empty file and
# an open handle behind.
with open('output/cds_retrieval.fa', 'wt') as fh:
    for k, v in cds.items():
        fh.write('>'+k+'\n')
        fh.write(v+'\n')
    

In [None]:
# Codon -> one-letter amino-acid table; stop codons map to '-'.
# Built from "CODON:AA" tokens, eight per row, in the same order as before.
codons = dict(
    token.split(':')
    for token in (
        'TTT:F TTC:F TTA:L TTG:L CTT:L CTC:L CTA:L CTG:L '
        'ATT:I ATC:I ATA:I ATG:M GTT:V GTC:V GTA:V GTG:V '
        'TCT:S TCC:S TCA:S TCG:S CCT:P CCC:P CCA:P CCG:P '
        'ACT:T ACC:T ACA:T ACG:T GCT:A GCC:A GCA:A GCG:A '
        'TAT:Y TAC:Y TAA:- TAG:- CAT:H CAC:H CAA:Q CAG:Q '
        'AAT:N AAC:N AAA:K AAG:K GAT:D GAC:D GAA:E GAG:E '
        'TGT:C TGC:C TGA:- TGG:W CGT:R CGC:R CGA:R CGG:R '
        'AGT:S AGC:S AGA:R AGG:R GGT:G GGC:G GGA:G GGG:G'
    ).split()
)



In [78]:
# Pull the sequence of scaffold101 out of the FASTA file and show a slice of it.
seq = ''   # stays empty (instead of undefined) when the scaffold is not in the file
with open('scaffold.fa', 'rt') as f:
    yes = 0   # 1 while the lines being read belong to scaffold101
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            if yes == 1:
                # next record reached: scaffold101 is complete
                break
            # Compare the whole first header token; a prefix test on the line
            # would also accept scaffold1010, scaffold1011, ...
            if line[1:].split()[:1] == ['scaffold101']:
                yes = 1
            continue
        if yes == 1:
            seq += line

print (seq[1861:1900])

TCAGCATGGCACTTTTTTCTGTAACTGTGTCAATTTGAG


In [None]:
from collections import defaultdict
import re
# cds maps "scaffold<TAB>gene<TAB>transcript" to the list of (start, end)
# coordinate strings of the CDS rows that follow that mRNA row.
cds = defaultdict(list)
geneid = None   # key built from the most recent mRNA row
with open('selectedGene.gff', 'rt') as f:
    for line in f:
        line = line.rstrip()
        lst = line.split()
        # blank lines and '#' comment lines carry no feature; indexing them
        # would raise IndexError
        if not lst or lst[0].startswith('#'):
            continue
        if lst[2] == 'mRNA':
            # gene and transcript names are read from the 10th
            # whitespace-separated token of the row
            mch = re.search(r'Gene_name=(.+);Transcript', lst[9])
            if mch is None:
                raise ValueError('cannot parse Gene_name from mRNA row: ' + line)
            gene = mch.group(1)
            mch = re.search(r'Transcript=(.+);$', lst[9])
            if mch is None:
                raise ValueError('cannot parse Transcript from mRNA row: ' + line)
            transcript = mch.group(1)

            geneid = lst[0]+'\t'+gene+'\t'+transcript
            cds[geneid] = []
        elif lst[2] == 'CDS':
            # without a preceding mRNA row there is no key to file the CDS under
            if geneid is None:
                raise ValueError('CDS row appears before any mRNA row: ' + line)
            cds[geneid].append((lst[3], lst[4]))


# show every transcript key with its CDS coordinate pairs
for k, v in cds.items():
    print (k, v)

In [6]:
# Scratch check: length of one example sequence. The recorded result is 85, the
# same value the read/CDS overlap step adds to the mate position.
len('GTCTCTATTAAAAATACAAAAAAATTAGCCGGGCATGGCGGTACATCCCTGTAACCCCAGCTACTCACAGGAAGAGGCAGTGCAC')

85

In [5]:
import re
# Scratch check of the Gene_name pattern against one example attribute string.
# `gene` first holds the raw attributes and is then replaced by the captured name.
gene = 'ID=ENSP00000326349; Gene=ENSG00000177489;Gene_name=OR2G2;Transcript=ENST00000320065;'

gene_name_pattern = re.compile(r'Gene_name=(.+);Transcript')
mch = gene_name_pattern.search(gene)
gene = mch.groups()[0]
print (gene)

OR2G2


In [44]:
# Scratch check of how a module-level string behaves when two functions share
# it through `global`.
na = ''


def ge():
    """Append 'A' to the module-level ``na`` and print the new value."""
    global na
    na = na + 'A'
    print (na)


def main():
    """Reset ``na`` to the empty string and call ``ge`` four times, once per letter of 'ACGT'."""
    global na
    for _ in 'ACGT':
        na = ''
        ge()


if __name__ == '__main__':
    main()

A
A
A
A


In [15]:
# Scratch check of the 0/1 flag pattern: the flag is 1 unless x equals 1.
x = 3
yes = int(x != 1)

if not yes:
    print (x)

if yes:
    print (3)

3


In [9]:
# Scratch check: max() applied to a two-element tuple returns the larger element.
x = (1,2)
max(x)

2

In [30]:
# Scratch check: a zero numerator divides and rounds to 0.0 without error.
round(0/3, 2)

0.0

In [36]:
# Scratch check: drop runs of lowercase 'n' and look at a slice of the result.
v = 'atgcgacganngacg'
seq = re.compile(r'n+').sub(r'', v)
print (seq[1:5])

# the values produced by stepping from 200 up to (not including) 1000 by 200
for i in (200, 400, 600, 800):
    print (i)

tgcg
200
400
600
800


In [31]:
# Print every symbol of scaffold16 that is not A, T, C, G or N.
# The genome loader keeps only the first whitespace-delimited header token as
# the key, so the scaffold is stored under 'scaffold16'; a key containing a
# space can never exist. The loop variable is named `nt` so that it does not
# overwrite the global `seq` with a single character.
for nt in genome['scaffold16']:
    if nt not in ['A','T','C','G','N']:
        print (nt)

In [None]:
def gc_content(scaffold = 'scaffold16', path = 'output/gc_distribution.txt'):
    """Return the GC-content series of one scaffold from the per-window table.

    Parameters
    ----------
    scaffold : str
        Scaffold name. A row is used only when its first column equals this
        name exactly.
    path : str
        Tab-separated table to read. Defaults to the file the notebook has
        always used, so existing calls behave as before.

    Returns
    -------
    tuple of (list of int, list of float)
        Window end coordinates (third column) and GC percentages (fifth
        column), in file order. Both lists are empty when no row matches.

    Raises
    ------
    ValueError
        If a matching row holds a non-numeric end coordinate or GC value.
    """
    x = []
    y = []
    with open(path, 'rt') as f:
        for line in f:
            # Header rows: the original 'Chr' check is kept, and the header that
            # section 1c writes ('Scaffold<TAB>Start...') is recognised too.
            if line.startswith('Chr') or line.startswith('Scaffold\tStart'):
                continue
            lst = line.rstrip().split('\t')
            # Exact comparison: a prefix test on the line would also collect
            # rows of e.g. scaffold160 when scaffold16 is requested.
            if lst[0] != scaffold:
                continue
            x.append(int(lst[2]))
            y.append(float(lst[4]))
    return x, y

position, gc = gc_content(scaffold = 'scaffold16')
# The x-axis limit uses the N-stripped length: the window coordinates produced
# by section 1c refer to the sequence after N runs are removed, so the raw
# length would leave the axis longer than the data.
seq = re.sub(r'N+', r'', genome['scaffold16'])

plt.plot(position,gc)
plt.xlim(250,len(seq))
plt.ylim(0, 100)
# Tick labels now match the tick positions (250000 is 250K, not 25K).
# NOTE(review): the fixed ticks assume the scaffold is longer than 750 kb - confirm.
plt.xticks([250, 250000, 500000, 750000, len(seq)], ['Start', '250K', '500K', '750K', 'End'])
plt.xlabel('Scaffold16')
plt.ylabel('GC Content (%)')
plt.savefig('output/Scaffold16 GC content curve')
plt.show()