In [2]:
#This is to calculate translation efficiency for genes with both EST and MS evidence
import pandas as pd
# Load the per-gene EST and mass-spec summary tables, using 'Gene_ID' as the index.
# NOTE(review): later cells read GenesBy*_Summary.txt and genes_trans_effi.csv from
# '../data/0_Original_abundance/', while this cell reads from '../data/' and writes
# to '../data/1_Original_fasta/' -- confirm which locations are current.
genes_EST = pd.read_table('../data/GenesByEST_Summary.txt', sep='\t', index_col='Gene_ID')
genes_MS = pd.read_table('../data/GenesByMassSpec_Summary.txt', sep='\t', index_col='Gene_ID')

# Keep only genes above the minimum evidence thresholds:
# more than 4 ESTs and more than 1 spectrum.
genes_EST5 = genes_EST[genes_EST['Number_of_ESTs'] > 4]
genes_MS2 = genes_MS[genes_MS['Total_Number_of_Spectra'] > 1]

# Join the two filtered tables on their Gene_ID index (pd.merge defaults to an
# inner join, so only genes present in both tables remain).
genes_merged = pd.merge(genes_EST5, genes_MS2, right_index=True, left_index=True)

# Extract the 2 abundance columns from the merged DataFrame.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
genes_EST_MS = genes_merged.loc[:, ['Number_of_ESTs', 'Total_Number_of_Spectra']]

# Translation efficiency = spectra count / EST count; name the Series so it
# becomes a labelled column when concatenated below.
Translation_efficiency = genes_EST_MS['Total_Number_of_Spectra'] / genes_EST_MS['Number_of_ESTs']
Translation_efficiency.name = 'Translation_efficiency'

# Append the efficiency Series to the right side of the abundance DataFrame.
genes_EST_MS_info = pd.concat([genes_EST_MS, Translation_efficiency], axis=1)

# Rank the genes by translation efficiency (ascending).
# `sort_index(by=...)` was removed in pandas 1.0; `sort_values` replaces it.
sorted_trans_effi = genes_EST_MS_info.sort_values(by='Translation_efficiency')

# Save the sorted DataFrame (Gene_ID index included) to a .csv file.
sorted_trans_effi.to_csv('../data/1_Original_fasta/genes_trans_effi.csv')

In [1]:
#Add CAI values to genes with both EST and MS evidence
import pandas as pd
# Read the tab-separated table holding the CAI values; the code below uses its
# 'NAME' and 'CAI' columns.
genes_CAI = pd.read_table('../data/4_CAI_values_CSV/CAI_EST5_and_MS2.txt', sep='\t')

# Pull the gene id and the protein length out of the free-text 'NAME' column.
# expand=False is required: since pandas 0.23 the default (expand=True) returns
# a DataFrame even for a single capture group, so assigning `.name` would not
# label the column and the merge on 'Gene_ID' below would fail.
# Rows whose NAME does not match a pattern get NaN.
Gene_ID = genes_CAI['NAME'].str.extract('(TVAG_[0-9]{6})', expand=False).rename('Gene_ID')
# Raw string: `\s` in a plain literal is an invalid escape sequence.
# The extracted length stays a string, exactly as captured.
Protein_length = (
    genes_CAI['NAME']
    .str.extract(r'Protein\s([0-9]+)\saa', expand=False)
    .rename('Protein_length')
)

# Assemble Gene_ID, Protein_length and CAI side by side in one DataFrame.
CAI_info = pd.concat([Gene_ID, Protein_length, genes_CAI['CAI']], axis=1)

# Read the translation-efficiency table; Gene_ID is kept as an ordinary column
# here so it can serve as the merge key.
genes_trans_effi = pd.read_csv('../data/0_Original_abundance/genes_trans_effi.csv')

# Inner-join the two tables on Gene_ID.
trans_effi_CAI = pd.merge(genes_trans_effi, CAI_info, left_on='Gene_ID', right_on='Gene_ID')

# Save the merged DataFrame to a .csv file (the default integer index is
# written as the first, unnamed column).
trans_effi_CAI.to_csv('../data/4_CAI_values_CSV/trans_effi_CAI.csv')



In [9]:
#Add CAI values to genes with MS>=2 evidence only
import pandas as pd
# Read the tab-separated table holding the CAI values; the code below uses its
# 'NAME' and 'CAI' columns.
genes_CAI = pd.read_table('../data/4_CAI_values_CSV/CAI_MS2_only.txt', sep='\t')

# Pull the gene id and the protein length out of the free-text 'NAME' column.
# expand=False is required: since pandas 0.23 the default (expand=True) returns
# a DataFrame even for a single capture group, so assigning `.name` would not
# label the column and the merge on 'Gene_ID' below would fail.
# Rows whose NAME does not match a pattern get NaN.
Gene_ID = genes_CAI['NAME'].str.extract('(TVAG_[0-9]{6})', expand=False).rename('Gene_ID')
# Raw string: `\s` in a plain literal is an invalid escape sequence.
Protein_length = (
    genes_CAI['NAME']
    .str.extract(r'Protein\s([0-9]+)\saa', expand=False)
    .rename('Protein_length')
)

# Assemble Gene_ID, Protein_length and CAI side by side in one DataFrame.
CAI_info = pd.concat([Gene_ID, Protein_length, genes_CAI['CAI']], axis=1)

# Read the mass-spec summary table (spectra counts per gene); Gene_ID stays an
# ordinary column so it can serve as the merge key.
MS_info = pd.read_table('../data/0_Original_abundance/GenesByMassSpec_Summary.txt', sep='\t')

# Inner-join on Gene_ID: only genes present in both tables are kept.
MS_CAI = pd.merge(MS_info, CAI_info, left_on='Gene_ID', right_on='Gene_ID')

# Keep the four columns of interest and save them to a .csv file.
MS_CAI_output = MS_CAI[['Gene_ID', 'Total_Number_of_Spectra', 'Protein_length', 'CAI']]
MS_CAI_output.to_csv('../data/4_CAI_values_CSV/MS2_only_CAI.csv')



In [12]:
#Add CAI values to genes with EST >=5 evidence only
import pandas as pd
# Read the tab-separated table holding the CAI values; the code below uses its
# 'NAME' and 'CAI' columns.
genes_CAI = pd.read_table('../data/4_CAI_values_CSV/CAI_EST5_only.txt', sep='\t')

# Pull the gene id and the protein length out of the free-text 'NAME' column.
# expand=False is required: since pandas 0.23 the default (expand=True) returns
# a DataFrame even for a single capture group, so assigning `.name` would not
# label the column and the merge on 'Gene_ID' below would fail.
# Raw strings: `\s` in a plain literal is an invalid escape sequence.
# Rows whose NAME does not match a pattern get NaN.
Gene_ID = (
    genes_CAI['NAME']
    .str.extract(r'(TVAG_[0-9A-Z_]+)\s', expand=False)
    .rename('Gene_ID')
)
Protein_length = (
    genes_CAI['NAME']
    .str.extract(r'Protein\s([0-9]+)\saa', expand=False)
    .rename('Protein_length')
)

# Assemble Gene_ID, Protein_length and CAI side by side in one DataFrame.
CAI_info = pd.concat([Gene_ID, Protein_length, genes_CAI['CAI']], axis=1)

# Read the EST summary table (EST counts per gene); Gene_ID stays an ordinary
# column so it can serve as the merge key.
EST_info = pd.read_table('../data/0_Original_abundance/GenesByEST_Summary.txt', sep='\t')

# Inner-join on Gene_ID: only genes present in both tables are kept.
EST_CAI = pd.merge(EST_info, CAI_info, left_on='Gene_ID', right_on='Gene_ID')

# Keep the four columns of interest and save them to a .csv file.
EST_CAI_output = EST_CAI[['Gene_ID', 'Number_of_ESTs', 'Protein_length', 'CAI']]
EST_CAI_output.to_csv('../data/4_CAI_values_CSV/EST5_only_CAI.csv')



In [14]:
#Add CAI values to genes with EST or MS evidence
import pandas as pd
# Read the tab-separated table holding the CAI values; the code below uses its
# 'NAME' and 'CAI' columns.
genes_CAI = pd.read_table('../data/4_CAI_values_CSV/CAI_EST5_or_MS2.txt', sep='\t')

# Pull the gene id and the protein length out of the free-text 'NAME' column.
# expand=False is required: since pandas 0.23 the default (expand=True) returns
# a DataFrame even for a single capture group, so assigning `.name` would not
# label the column and the CSV below would get headers `0, 0, CAI`.
# Raw strings: `\s` in a plain literal is an invalid escape sequence.
# Rows whose NAME does not match a pattern get NaN.
Gene_ID = (
    genes_CAI['NAME']
    .str.extract(r'(TVAG_[0-9A-Z_]+)\s', expand=False)
    .rename('Gene_ID')
)
Protein_length = (
    genes_CAI['NAME']
    .str.extract(r'Protein\s([0-9]+)\saa', expand=False)
    .rename('Protein_length')
)

# Assemble Gene_ID, Protein_length and CAI side by side in one DataFrame.
CAI_info = pd.concat([Gene_ID, Protein_length, genes_CAI['CAI']], axis=1)

# Save the combined DataFrame to a .csv file.
# NOTE(review): this is written directly under '../data/', unlike the other CAI
# outputs, which go to '../data/4_CAI_values_CSV/' -- confirm this is intended.
CAI_info.to_csv('../data/EST5_or_MS2_CAI.csv')



In [1]:
#Report Gene_ID and promoter_type after M5_Inr sorting
from Bio import SeqIO
from Bio.Alphabet import IUPAC
import numpy as np

# Both lists start with their column header so the saved table has a header row.
ID = ['Gene_ID']
promoter = ['promoter_type']

# The alphabet argument is intentionally not passed: Biopython >= 1.78 raises
# ValueError when one is supplied, and it never influenced `rec.id` or
# `rec.description`, the only fields used here.
# NOTE(review): this makes the cell's `from Bio.Alphabet import IUPAC` import
# unused; Bio.Alphabet no longer exists in Biopython >= 1.78, so drop that
# import when running on a current Biopython.
for rec in SeqIO.parse('TSS_100CDS_valid_EST5_and_MS2.fasta', "fasta"):
    ID.append(rec.id)
    # Fixed-offset slice of the FASTA description line.
    # NOTE(review): presumably strips a 12-character prefix and an 11-character
    # suffix around the promoter type -- confirm against the header layout.
    promoter.append(rec.description[12:-11])

# Stack the two columns as rows, then transpose on output so that each line of
# the file is "<Gene_ID>\t<promoter_type>".
ID_arr = np.array(ID)
promoter_arr = np.array(promoter)
output = np.vstack((ID_arr, promoter_arr))
np.savetxt('ID_promoter_EST5_and_MS2.txt', output.T, fmt='%s', delimiter='\t')
    

In [3]:
#Extract necessary info of sequences with top or bottom 200 translation efficiency.
import pandas as pd
# Read the Gene_ID -> promoter_type table and the translation-efficiency/CAI
# table, both indexed by 'Gene_ID'.
# NOTE(review): the cell that creates ID_promoter_EST5_and_MS2.txt writes it to
# the working directory, not to '../data/8_top_bottom_200_effi/' -- confirm the
# file is copied there before this cell runs.
genes_EST_MS = pd.read_table('../data/8_top_bottom_200_effi/ID_promoter_EST5_and_MS2.txt', sep='\t', index_col='Gene_ID')
genes_m5_Inr = pd.read_csv('../data/4_CAI_values_CSV/trans_effi_CAI.csv', index_col='Gene_ID')

# Inner-join the two tables on their Gene_ID index.
genes_merged = pd.merge(genes_EST_MS, genes_m5_Inr, right_index=True, left_index=True)

# Rank explicitly by translation efficiency before slicing. The row order coming
# out of the merge is not guaranteed to follow efficiency, so taking the first
# and last 200 rows of the merged frame directly could select the wrong genes.
# A stable sort keeps the incoming order among ties.
ranked = genes_merged.sort_values('Translation_efficiency', kind='mergesort')
ranked = ranked[['Translation_efficiency', 'promoter_type']]

# `.ix` was removed in pandas 1.0; `.iloc` makes the positional slicing explicit.
# If fewer than 200 genes exist, the slices simply return what is available.
effi_bottom_200 = ranked.iloc[:200]
effi_top_200 = ranked.iloc[-200:]

# Save the two selections to .csv files.
effi_top_200.to_csv('../data/8_top_bottom_200_effi/effi_top_200.csv')
effi_bottom_200.to_csv('../data/8_top_bottom_200_effi/effi_bottom_200.csv')

In [1]:
#Add MFE values of the first 40nt and key window to each gene with both MS and EST evidence.
import pandas as pd
# Read the translation-efficiency/CAI table and the MFE table, both indexed by
# 'Gene_ID'.
effi = pd.read_csv('../data/4_CAI_values_CSV/trans_effi_CAI.csv', index_col='Gene_ID')
MFE = pd.read_csv('../data/5_TSS_100CDS_general_trend_GC/values_MFE_5terminus_key.csv', index_col='Gene_ID')

# Inner-join the two tables on their Gene_ID index.
effi_MFE = pd.merge(effi, MFE, right_index=True, left_index=True)

# Keep the four columns of interest and save them to a .csv file.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
effi_MFE.loc[:, ['Translation_efficiency', 'MFE_5terminus', 'MFE_key', 'CAI']].to_csv('../data/4_CAI_values_CSV/trans_effi_MFE_CAI.csv')