Yeast codon task - Bioinformatics Task 1

Element 1 : Read and interpret a fasta file containing coding sequences from this yeast genome. You must do this within some constraints. The file must be processed line by line. You cannot use any special bioinformatics or data analysis software. You must tabulate the genomic features of gene length in nucleotides and amino acids. Report any discrepancies (e.g. partial codons). You should also report the count of coding direction for all the genes (something like... coding 1021; complement 575; total genes 1596).

This element has 9 marks associated with it (6 conceptual, 3 implementation) – namely, is the file opened correctly and is each entry processed into manageable data (i.e. the required data components are stored somewhere useful like a list) and is the file correctly handled (e.g. processed line by line).

In [1]:
#import the libraries that we need 

import numpy as np 
import re
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import csv


#Create lists for storing the genetic information and the DNA sequence

yeast_info = []
dna_seq = ''

#### Opening the fasta file and saving it to a variable


with open('yeast_fasta_full', newline='') as fasta_file:
    
    #read the file line by line by looping
    for line in fasta_file:
    
    
        #seperate based wether data is genetic info or DNA, identified by a > at the start
        if line[0] == '>':
            
            yeast_info.append(line)

        # Collect and store the dna_sequence for processing                        
            
        else:
            
            if line[0] != '>':
                
                line = line.strip()   
                dna_seq += line
                
            else:
                
                dna_seq =''    

You should also report the count of coding direction for all the genes (something like... coding 1021; complement 575; total genes 1596).

In [2]:
#count the occurence of keywords for genetic information (complement, intro(join)) in the string
count_intron = 0 
count_complement = 0 
count_total = 0 

#find the keywords using re

find_comp = re.findall(r'complement', str(yeast_info))
for match in find_comp:
    count_complement = count_complement + 1
    
find_intron = re.findall(r'join', str(yeast_info))
for match in find_intron:
    count_intron = count_intron + 1
    
find_all = re.findall(r'location', str(yeast_info))
for match in find_all:
    count_total = count_total + 1

#we cant search for coding segement so we wil find all the locations and then minus the two others. 

count_coding = count_total - (count_intron + count_complement)

#count the total amount of nucleotides, amino acids and check for partial codons. 

length_aa = (len(dna_seq) // 3)
partial_codons = (len(dna_seq) % 3)
nucleotide_len = (len(dna_seq))


#write the function for tabulating data

def tabulate(name, data, table_name):
    
    print(table_name)
    for title, region in zip(name, data):
        
        print("{}|{}\t".format (title, (region)))
        


#Tabulaute the data for the coding direction, making the name, data, table_name

regions_list = [count_coding, count_complement, count_intron, count_total]

dir_names_list = ['Count of coding genes    ', 'Count of complement genes', 'Count of intron genes    ', 'Count of total genes     ']

codingdir_table_name = 'Table to show the number of different coding direction of the yeast genome\n'



#Tabulate the data for nucleotide length and partial codons, making the name, data, table_name 

gene_info = [nucleotide_len, length_aa, partial_codons]

gene_info_name_list = ['Nucleotide number       ', 'Number of Amino acids   ', 'Number of partial codons']

gene_name = 'Table to show the number of nucleotides, amino acids and partial codons of the yeast genome\n'
        
# print out the data in a table format 
        
tabulate(gene_info_name_list, gene_info, gene_name)

print('')
tabulate(dir_names_list, regions_list, codingdir_table_name) 

#save the tabulated data to a csv file 
print('')


#save the tabulated data to a file 

with open('Gene_Info_Yeast_1_NDS.txt', 'w', newline='') as Outf:
    writer = csv.writer(Outf)
    data_list = [['Nucleotide number', 'Number of Amino acids', 'Number of partial codons'], 
                 ['460887           ', '153629               ', '0                      ']]
    writer = csv.writer(Outf)
    Outf.write('Table for Genetic information\n')
    writer = csv.writer(Outf, delimiter='|')
    writer.writerows(data_list)


with open('Gene_Info_Yeast_1_NDS.txt', 'r') as filer:
    for line in filer:
        print(line)

Outf.close()
filer.close()

print('')


#save the tabulated data to a file 

with open('Coding_Info_Yeast_1_NDS.txt', 'w', newline='') as C_Outf:
    C_writer = csv.writer(C_Outf)
    C_data_list = [['Nucleotide number', 'Number of Amino acids', 'Number of partial codons'], 
                 ['460887           ', '153629               ', '0                      ']]
    C_writer = csv.writer(C_Outf)
    C_Outf.write('Table for Coding direction information\n')
    C_writer = csv.writer(C_Outf, delimiter='|')
    C_writer.writerows(data_list)


with open('Coding_Info_Yeast_1_NDS.txt', 'r') as C_filer:
    for line in C_filer:
        print(line)

C_Outf.close()
C_filer.close()


Table to show the number of nucleotides, amino acids and partial codons of the yeast genome

Nucleotide number       |460887	
Number of Amino acids   |153629	
Number of partial codons|0	

Table to show the number of different coding direction of the yeast genome

Count of coding genes    |156	
Count of complement genes|169	
Count of intron genes    |6	
Count of total genes     |331	

Table for Genetic information

Nucleotide number|Number of Amino acids|Number of partial codons

460887           |153629               |0                      


Table for Coding direction information

Nucleotide number|Number of Amino acids|Number of partial codons

460887           |153629               |0                      



You must tabulate the genomic features of gene length in nucleotides and amino acids. Report any discrepancies (e.g. partial codons)

Element 2 –  Tabulate and report the total occurrence of all encoded serine and leucine amino acids.
This element has 4 marks associated with it (2 conceptual, 2 implementation).


In [3]:
#Note down all the codons that code for the amino acids Serine and Leucine

leucine_codonsRNA = ('UUA', 'UUG', 'CUU', 'CUC', 'CUA')
serine_codonsRNA = ('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC')
shared_codonRNA = ('CUG')

#we have a sequence given in DNA so we will convert the U' to T's.

shared_codon = ('CTG')
serine_codons = ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC')
leucine_codons = ('TTA', 'TTG', 'CTT', 'CTC', 'CTA')

#count the amount of leucines 

l_counter = 0
amino = [dna_seq[i:i+3] for i in range(0, len(dna_seq), 3)]
for base in amino:
        
    if base == 'TTA' or base =='TTG' or base == 'CTA' or base == 'CTC' or base =='CTT':
            
        l_counter = l_counter + 1 
    
    
#count the amount of serines

s_counter = 0
amino = [dna_seq[i:i+3] for i in range(0, len(dna_seq), 3)]
for base in amino:
        
    if base == 'TCT' or base =='TCC' or base == 'TCA' or base == 'AGT' or base =='TCG' or base == 'AGC':
            
        s_counter = s_counter + 1



#count the shared codon 

shared_counter = 0
amino = [dna_seq[i:i+3] for i in range(0, len(dna_seq), 3)]
for base in amino:
        
    if base == 'CTG':
            
        shared_counter = shared_counter + 1
 

# Generate lists containing data needed for tables

amino_acid_data = [s_counter, l_counter, shared_counter]

aa_names = ['Serine ', 'Leucine', 'Shared ']
aa_table_name = 'Table to show the amount of encoded Serines and Leucines\n'

#Tabulaute the data
# def tablulate(name, data, table_name):

tabulate(aa_names, amino_acid_data, aa_table_name)
print('')


#save the tabulated data to a file 

with open('AminoAcid_Info_Yeast_1_NDS.txt', 'w', newline='') as AA_Outf:
    AA_writer = csv.writer(AA_Outf)
    aa_data_list = [['Serine', 'Leucine', 'Shared '], 
                    ['12570 ', '14232  ', '1767   ']]
    AA_writer = csv.writer(AA_Outf)
    AA_Outf.write('Table to show the amount of encoded Serines and Leucines\n')
    AA_writer = csv.writer(AA_Outf, delimiter='|')
    AA_writer.writerows(aa_data_list)


with open('AminoAcid_Info_Yeast_1_NDS.txt', 'r') as AA_filer:
    for line in AA_filer:
        print(line)

AA_Outf.close()
AA_filer.close()


Table to show the amount of encoded Serines and Leucines

Serine |12570	
Leucine|14232	
Shared |1767	

Table to show the amount of encoded Serines and Leucines

Serine|Leucine|Shared 

12570 |14232  |1767   



Element 3 – Make a plot that compares the frequency of CUG to the other codons of serine and leucine. What are some plausible biological arguments for differences in codon frequency? Suggest one additional analysis that either addresses CUG usage or something else you observed from these exercises.
This element has 7 marks associated with it (5 conceptual, 2 implementation)


In [4]:
#lets remind ourelves of the bases we are looking for

shared_codon = ('CTG')
serine_codons = ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC')
leucine_codons = ('TTA', 'TTG', 'CTT', 'CTC', 'CTA')

#shared codon 
    
freq_ctg = 0

#serine codon 
    
freq_tct = 0
freq_tcc = 0
freq_tca = 0
freq_tcg = 0
freq_agt = 0
freq_agc = 0
    
#leucine codons 

freq_tta = 0
freq_ttg = 0
freq_ctt = 0
freq_ctc = 0
freq_cta = 0

amino = [dna_seq[i:i+3] for i in range(0, len(dna_seq), 3)]
for base in amino:
    
    if base == 'CTG':
                
        freq_ctg += 1
                
    elif base == 'TCT':
        
        freq_tct += 1
    elif base == 'TCC':
        
        freq_tcc +=1 
    elif base == 'TCA':
        
        freq_tca +=1
    elif base == 'TCG':
        
        freq_tcg +=1
    elif base == 'AGT':
        
        freq_agt +=1
    elif base == 'AGC':
        
        freq_agc +=1
            
#leucine codons now 
    elif base == 'TTA':
        
        freq_tta +=1
    elif base == 'TTG':
        
        freq_ttg += 1
    elif base == 'CTT':
        
        freq_ctt +=1
    elif base == 'CTC':
        
        freq_ctc +=1
    elif base == 'CTA':
        
        freq_cta +=1

        
# gather together the data for plotting
all_codon_plot = (freq_ctg, freq_tct, freq_tcc, freq_tca, freq_tcg, freq_agt, freq_agc, freq_tta, freq_ttg,
          freq_ctt, freq_ctc, freq_cta  )           

#turn the data into arrays for processing
all_codon_array = np.array(all_codon_plot)
string_codons = 'CTG','TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA'
codon_ord_list = np.array(string_codons)

#join data into one array
joined_data = (all_codon_array, codon_ord_list)

#working out percentage of codon in total amino acid length 
percentage_freqs = (all_codon_array / length_aa)
percentage_freqs

array([0.01150173, 0.01658541, 0.01277103, 0.01322016, 0.01661145,
       0.00943832, 0.01319412, 0.00969218, 0.03806573, 0.01880504,
       0.01805649, 0.00801932])

In [17]:
#plot all the data onto a figure 

fig = px.scatter(
    x=codon_ord_list, 
    y=percentage_freqs,
    color = codon_ord_list,
    size = [2,2,2,2,2,2,2,2,2,2,2,2],
    title = 'Graph to show the frequency of codons used for Serine and Leucines in a Yeast genome'
)

fig.update_layout(showlegend=True, 
                  xaxis_type='category', xaxis_title='Codon base in Yeast genes',
                 yaxis_title='Frequency of codon againist total codons %',
                 legend_title='Codon') 

#adding a vertical line to sepearate data based on serine or leucine

fig.add_shape(
dict(
    type = 'line', 
x0 = 0.68,
y0 = 0,
x1 = 0.68,
y1 = 0.055,
    line = dict( 
    color = 'MediumPurple',
    width = 3)
))

#adding a vertical line  to sepearate data based on serine or leucine

fig.add_shape(
dict(
    type = 'line', 
x0 = 6.5,
y0 = 0,
x1 = 6.5,
y1 = 0.055,
    line = dict( 
    color = 'LightSeaGreen',
    width = 3)
))

# Create scatter trace of text labels

fig.add_trace(go.Scatter(
    x=['CTG','TCC', 'TTG'],
    y=[0.05, 0.05, 0.05],
    text=['Shared codon',
          "Serine codons",
          "Leucine codons"],
    mode="text"
))