# Python Script

#### installing biopython

In [None]:
conda install biopython

#### importing necessary packages

In [None]:
from Bio import SeqIO
from Bio.Data import CodonTable
import pandas as pd
from Bio.Seq import Seq

#### #1

In [None]:
def get_sequences_from_file(fasta_fn):
    """Map each species name found in a FASTA file to its sequence.

    The species name is taken from each record's description line: the
    description is split on whitespace and the second and third tokens are
    joined with a single space.

    Args:
        fasta_fn: FASTA file to read; passed unchanged to ``SeqIO.parse``.

    Returns:
        dict: species name -> the record's ``seq``. When several records
        yield the same species name, the last one read is kept.

    Raises:
        IndexError: if a description has fewer than three tokens.
    """

    def species_of(fasta_record):
        # Tokens 1 and 2 of the description form the species name.
        tokens = fasta_record.description.split()
        return tokens[1] + " " + tokens[2]

    return {
        species_of(fasta_record): fasta_record.seq
        for fasta_record in SeqIO.parse(fasta_fn, "fasta")
    }

#### #2

In [None]:
def translate_function(string_nucleotides):
    """Translate a nucleotide sequence into amino acids, codon by codon.

    The "Vertebrate Mitochondrial" codon table is used. The sequence is read
    in consecutive, non-overlapping triplets starting at position 0, and
    translation stops at the first stop codon (which is not included in the
    result).

    Args:
        string_nucleotides: DNA sequence; converted with ``str()`` and
            upper-cased before lookup.

    Returns:
        str: the one-letter amino-acid string. Empty when the input is
        shorter than one codon or starts with a stop codon.

    Raises:
        KeyError: if a complete codon is not in the table's forward table
            (for example, because it contains a non-ACGT character).
    """
    mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
    nucleotides = str(string_nucleotides).upper()

    # Only complete codons can be looked up; drop a trailing 1- or 2-base
    # remainder instead of failing on it.
    usable_length = len(nucleotides) - len(nucleotides) % 3

    amino_acids = []
    for start in range(0, usable_length, 3):
        codon = nucleotides[start:start + 3]
        if codon in mito_table.stop_codons:
            break  # stop codon ends the translation
        amino_acids.append(mito_table.forward_table[codon])

    # Return only after the whole sequence (or the first stop codon) has been
    # processed, not after the first codon.
    return "".join(amino_acids)
            
            
#I used this website : https://www.geeksforgeeks.org/dna-protein-python-3/ 
 #to help with the above code

#### #3

In [None]:
def different_translate_function(string_nucleotides):
    """Translate a nucleotide string using Biopython's ``Seq.translate``.

    Args:
        string_nucleotides: nucleotide sequence used to build a ``Seq``.

    Returns:
        The result of ``Seq.translate`` called with the "Vertebrate
        Mitochondrial" codon table and ``to_stop=True``.
    """
    codon_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
    return Seq(string_nucleotides).translate(table=codon_table, to_stop=True)

#I used this website:https://biopython.org/docs/1.75/api/Bio.Seq.html
#to help with this code

#### #4


In [None]:
#importing necessary packages for the following code
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [None]:
def compute_molecular_weight(aa_seq):
    """Return the molecular weight of an amino-acid sequence.

    Args:
        aa_seq: amino-acid sequence; it is converted with ``str()`` before
            being handed to ``ProteinAnalysis``.

    Returns:
        The value of ``ProteinAnalysis(...).molecular_weight()``.
    """
    return ProteinAnalysis(str(aa_seq)).molecular_weight()

#I used this website: https://biopython.org/docs/1.76/api/Bio.SeqUtils.ProtParam.html
#to help with the code. 

#### #5

In [None]:
def calculate_gc_content(string_nucleotides):
    """Return the fraction of G and C bases in a nucleotide sequence.

    Counting is case-insensitive, so ``"atgc"`` and ``"ATGC"`` give the same
    result.

    Args:
        string_nucleotides: nucleotide sequence; converted with ``str()``.

    Returns:
        float: (number of G + number of C) / sequence length, between 0 and 1.
        An empty sequence yields 0.0 instead of raising ZeroDivisionError.
    """
    sequence = str(string_nucleotides).upper()
    if not sequence:
        return 0.0
    gc_count = sequence.count("G") + sequence.count("C")
    return gc_count / len(sequence)

#### main part of code

In [None]:
# Cytochrome-b sequences read from the FASTA file, keyed by species name.
cytb_seqs = get_sequences_from_file("penguins_cytb.fasta")

# Body-mass table; this CSV includes only data for body mass.
penguins_df = pd.read_csv("penguins_mass.csv")

# The species column as a plain Python list.
species_list = list(penguins_df["species"])

#### #6

In [None]:
# Add two placeholder columns filled with real missing values.
# float("nan") is used rather than the string 'NaN' so the columns are numeric
# and pandas treats the entries as missing (e.g. in isna()).
penguins_df["Molecular_Weight"] = float("nan")
penguins_df["GC_Content"] = float("nan")

penguins_df

#I used this website: https://www.geeksforgeeks.org/how-to-add-empty-column-to-dataframe-in-pandas/
#to help with this code

#### #7

In [None]:
gc_content_penguins = []  # GC fraction of each cytochrome-b DNA sequence
molecular_weight_penguins = []  # molecular weight of each translated protein
aa_seq_list = []  # translated amino-acid sequence for each species

for species_name, sequence in cytb_seqs.items():
    sequence_string = str(sequence)  # nucleotide sequence as a plain string
    aa_seq = different_translate_function(sequence_string)  # DNA -> amino acids
    aa_seq_list.append(aa_seq)
    gc_content_penguins.append(calculate_gc_content(sequence_string))
    # The molecular weight is a property of the protein, so it is computed
    # from the translated sequence, not from the raw nucleotide string.
    molecular_weight_penguins.append(compute_molecular_weight(aa_seq))

# NOTE(review): the lists are assigned to rows by position, which assumes the
# sequences in cytb_seqs are in the same order as the rows of penguins_df
# (and that there are equally many of each) - confirm against the input files.
penguins_df_2 = penguins_df.assign(
    GC_Content=gc_content_penguins,
    Molecular_Weight=molecular_weight_penguins,
)
penguins_df_2

#I used this website to figure out how to append data to a list: https://stackoverflow.com/questions/56321765/append-values-from-dataframe-column-to-list
#and this one to assign the lists to the data frame: https://www.geeksforgeeks.org/python-pandas-dataframe-assign/#

#### #8

In [None]:
conda install seaborn #installing seaborn for figures

In [None]:
import matplotlib.pyplot as plt #importing necessary packages
import seaborn as sns

In [None]:
# Bar chart of body mass per species: species names on the x-axis, mass on
# the y-axis, drawn in purple.
plt.bar(
    "species",
    "mass",
    data=penguins_df_2,
    color="purple",
)
plt.title("Mass by Species")  # chart title
plt.xlabel("Species")  # x-axis label
plt.ylabel("Mass(g)")  # y-axis label
plt.xticks(rotation=90)  # rotate the species names on the x-axis

#I used these two websites to help make the graph: https://www.geeksforgeeks.org/bar-plot-in-matplotlib/
#and https://stackoverflow.com/questions/10998621/rotate-axis-text-in-matplotlib

Q1: What is the smallest penguin species?
A1: Eudyptula minor

Q2: What is the geographical range of this species?
A2: This species is found along the southern coast of Australia, as far north as South Solitary Island. It is also found in New Zealand and on the Chatham Islands.

source:
Hoskins, A. J., Dann, P., Ropert-Coudert, Y., Kato, A., Chiaradia, A., Costa, D. P., & Arnould, J. P. Y. (2008). Foraging behaviour and habitat selection of the little penguin Eudyptula minor during early chick rearing in Bass Strait, Australia. Marine Ecology Progress Series, 366, 293–303. http://www.jstor.org/stable/24872877 

#### #9

In [None]:
# Plot of GC content (x-axis) against molecular weight (y-axis), drawn with
# diamond markers.
sns.lmplot(
    data=penguins_df_2,
    x="GC_Content",
    y="Molecular_Weight",
    markers="D",
)


#### #10

In [None]:
# Write the extended data frame to a CSV file, leaving out the row index.
output_csv = "penguins_mass_cytb.csv"
penguins_df_2.to_csv(output_csv, index=False)

#I used this website to help with the code: https://datatofish.com/export-dataframe-to-csv/

#### #11

In [None]:
# Bonus figure: body mass (x-axis) against GC content (y-axis), drawn with
# diamond markers and without a fitted regression line.
sns.lmplot(
    data=penguins_df_2,
    x="mass",
    y="GC_Content",
    markers="D",
    fit_reg=False,
)

In [None]:
#pushing to github!