In [1]:
%%capture
!pip install SeqIO
!pip install Bio

In [2]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# Reading and sorting dataset

In [4]:
# -- Function for reading Dfam dataset in csv format
def read_dataset(input_path):
    df = pd.read_csv(input_path, low_memory = False)   # Low_memory argument for reading whole file
    return df 

In [6]:
# -- Read dataset (locally) 
path = "/Users/leandrojorqueravalero/Desktop/PhD/synthetic_ORF2/data/preprocessed_LINE_v2.csv"
df=read_dataset(path)
len(df.index)   # Check complete reading 

100000

In [8]:
# -- Select desired columns and rename

sorted_df = df.iloc[:,[19, 3,8, 18, 16]]   # 3: length  // 8:DNA_seq // 16: species data // 18: counts info // 19:ID + Dfam entry
sorted_df.columns = ['ID','Length','DNA_seq', 'Counts', 'Species']
#sorted_df.head(5)

# -- Select columns matching required criteria (length)
filtered_df = sorted_df.loc[sorted_df['Length']>3000]
#len(filtered_df.index)

# -- removing duplicates
filtered_df.drop_duplicates(subset=['ID'], keep=False)
len(filtered_df.index)

26431

# Retrieving species name

In [10]:
taxonomy = filtered_df['Species']
name = filtered_df['ID'] 
species_dict = {}
for i in range(len(filtered_df)):     # len(filtered_df)
    ID = name.iloc[i]
    species = taxonomy.iloc[i]
    #print(ID, species)
    split = species.split(";")
    #print(split)
    spec_name = split[-1]
    clean_name = spec_name.strip("']")
    species_dict[ID] = clean_name

# -- Show ouput dict
species_dict

# Make a df and ID column is set as index
taxo_df = pd.DataFrame.from_dict(species_dict, orient='index') 
L1_taxo = taxo_df.reset_index() 
L1_taxo.columns = ['ID', 'Taxonomy']
L1_taxo

Unnamed: 0,ID,Taxonomy
0,L1M5_orf2DF0000008,Eutheria
1,L1M1_5endDF0000227,Primates
2,L1M1_orf2DF0000228,Primates
3,L1M2_orf2DF0000236,Eutheria
4,L1M3_orf2DF0000244,Eutheria
...,...,...
1353,TE3262_SO2_FAM455DF0289703,Sitophilus oryzae
1354,TE3263_SO2_FAM455DF0289704,Sitophilus oryzae
1355,TE3264_SO2_FAM465DF0289705,Sitophilus oryzae
1356,TE3265_SO2_FAM465DF0289706,Sitophilus oryzae


In [21]:
eukarya_classes = ['Mammalia', 'Aves', 'Reptilia', 'Actinopterygii', 'Amphibia', 'Insecta', 'Fungi', 'Plantae']

In [44]:
taxonomy = filtered_df['Species']
name = filtered_df['ID'] 
species_dict = {}

for i in range(10):
    ID = name.iloc[i]
    spec = taxonomy.iloc[i]
    parts = spec.split(";")
    for word in parts:
        if word in eukarya_classes:
            taxo_dict[ID] = word
#taxo_dict
#len(taxo_dict.keys())

# Make a df and ID column is set as index
taxo_info = pd.DataFrame.from_dict(taxo_dict, orient='index') 
L1_taxo = taxo_info.reset_index() 
L1_taxo.columns = ['ID', 'Taxonomy']
L1_taxo

Unnamed: 0,ID,Taxonomy
0,L1P1_orf2DF0000316,Mammalia
1,L1M5_orf2DF0000008,Mammalia
2,L1M1_5endDF0000227,Mammalia
3,L1M1_orf2DF0000228,Mammalia
4,L1M2_orf2DF0000236,Mammalia
...,...,...
1353,RTE1DF0001521,non_listed
1354,LINE2C1_CEDF0004135,non_listed
1355,L1-1_TestuDF0004210,non_listed
1356,RTE-1_TestuDF0004211,non_listed


# Retrieving count number (1357 out of 26431 are valid)

In [45]:
# select desired columns and create empty output dict
name = filtered_df['ID']  
counts = filtered_df['Counts']
output_dict = {}

# -- testing the number of column with NaN values (only 1357 out of 26431 have valid values)
#nonNA_counts = filtered_df['Counts'].dropna()
#len(nonNA_counts)

# iterating over dataset
for i in range(len(filtered_df)):
    ID = name.iloc[i]
    if type(counts.iloc[i]) == str: # must be string to be converted to dict
      hmm_dict = eval(counts.iloc[i])
      count_list = list(hmm_dict.values())  
      count_dict = eval(str(count_list[0])) 
      all = count_dict['gathering_all']
      non_redun = count_dict['gathering_nonredundant']
      #print(ID , all, non_redun)
      output_dict[ID] = (all,non_redun) # associate both number to each ID

# Display output dict
#output_dict

# Make a df and ID column is set as index
count_df = pd.DataFrame.from_dict(output_dict, orient='index') 
#count_df

# Resetting index to displace ID col
ORF2p_counts = count_df.reset_index()

# Renaming columns
ORF2p_counts.columns = ['ID','All', 'Non_redundant']
ORF2p_counts

Unnamed: 0,ID,All,Non_redundant
0,L1M5_orf2DF0000008,337941,114873
1,L1M1_5endDF0000227,96690,11185
2,L1M1_orf2DF0000228,277575,11129
3,L1M2_orf2DF0000236,296693,17500
4,L1M3_orf2DF0000244,299220,21747
...,...,...,...
1352,TE3262_SO2_FAM455DF0289703,12387,840
1353,TE3263_SO2_FAM455DF0289704,12914,900
1354,TE3264_SO2_FAM465DF0289705,3843,285
1355,TE3265_SO2_FAM465DF0289706,1472,166


# Protein extraction with simpler ORFinder

In [46]:
# -- Extracting all protein sequences from DNA with simpler finder
prot_list = []
prot_dict = {}
name = filtered_df['ID']  
DNA = filtered_df['DNA_seq'] 

for i in range(len(filtered_df)):    #There's no NaN values in DNA column
  ID = name.iloc[i]
  if type(DNA.iloc[i]) == str:
    record = Seq(DNA.iloc[i])
  for strand, nuc in [(+1, record), (-1, record.reverse_complement())]:   # explore seq and rev complement
       for frame in range(3): #explore all 3 ORFs
         min_pro_len = 1000
         table = 1
         length = 3 * ((len(record) - frame) // 3)  # Multiple of three
         for pro in nuc[frame : frame + length].translate(table).split("*"):
             if len(pro) >= min_pro_len:
               #print( "%s - %s...%s - length %i," % (ID, pro[:20], pro[-3:], len(pro)))
               prot_list.append(str(pro))
               prot_dict[ID] = str(pro)

# Display output prot dict
#prot_dict

# Make a df and ID column is set as index
prot_df = pd.DataFrame.from_dict(prot_dict, orient='index') 
# Resetting index to displace ID col
proteins_df = prot_df.reset_index()

# Using drop() function to delete last row and renaming cols
proteins_df.columns = ['ID','Seq']
#final_df.drop(index=final_df.index[-1], inplace=True)
proteins_df

Unnamed: 0,ID,Seq
0,L1M5_orf2DF0000008,MVDLNPXISIITLNVNGLNTPIKRQRLSDWIKKQDPTICCLQETHF...
1,L1M1_orf2DF0000228,MAGVSPYLSIITLNVNGLNSPIKRHRVAEWMKKQDPXICCLQETHF...
2,L1M2_orf2DF0000236,MAXVSPYLSIITLNVNGLNSPIKRHRXAEWIKKQDPTICCLQETHF...
3,L1M3_orf2DF0000244,MAIVSPXLSIITLNVNGLNSPIKRHRVAEWIKKQDPTICCLQETHF...
4,L1M4_orf2DF0000250,MADVNPTXSVITLNVNGLNTPIKRQRLAEWIKKHDPTICCLQETHF...
...,...,...
716,TE3105_SO2_FAM184DF0289547,TRSKVLLQPLFPPQTYPFFLPPMHGISQINLHHAKGSSATIARLFE...
717,TE3121_SO2_FAM192DF0289563,PLXGGGTSVARGGWEIPCPPYNRGTLTYPRNLMHGISQINLHHAKG...
718,TE3146_SO2_FAM209DF0289588,RRGFLTTRMAPSQPGLRVVQINLNHCEAATEDLMLFMSEKKVDVAL...
719,TE3261_SO2_FAM454DF0289702,ISENGLPTAVQKRNGTLTVNHFPKILMADKNDRMALSPGHPLSGER...


# Exporting as fasta file from prot_dict{}

In [None]:
# output file name
output_file = 'ORF2p_v2.fasta'

# open output file for writing
with open(output_file, 'w') as out_file:
  for seq_id, seq in prot_dict.items(): # loop through dictionary items 
    seq_record = SeqIO.SeqRecord(Seq(seq), id= str(seq_id), description="") # make sure to save ID as string
    SeqIO.write(seq_record, out_file, 'fasta')

## Merging ORF2p seqs and count number

In [49]:
# -- Merging both datasets and exporting as csv

ORF2p_seqs_counts = pd.merge(proteins_df,ORF2p_counts)
ORF2p_with_taxo = pd.merge(ORF2p_seqs_counts, L1_taxo)

ORF2p_with_taxo

#output_path = "/Users/leandrojorqueravalero/Desktop/PhD/Synthetic_ORF2p"

#ORF2p_with_taxo.to_csv(r'/Users/leandrojorqueravalero/Desktop/PhD/synthetic_ORF2/data/ORF2p_taxo_v2.csv', index= False)


Unnamed: 0,ID,Seq,All,Non_redundant,Taxonomy
0,L1M5_orf2DF0000008,MVDLNPXISIITLNVNGLNTPIKRQRLSDWIKKQDPTICCLQETHF...,337941,114873,Mammalia
1,L1M1_orf2DF0000228,MAGVSPYLSIITLNVNGLNSPIKRHRVAEWMKKQDPXICCLQETHF...,277575,11129,Mammalia
2,L1M2_orf2DF0000236,MAXVSPYLSIITLNVNGLNSPIKRHRXAEWIKKQDPTICCLQETHF...,296693,17500,Mammalia
3,L1M3_orf2DF0000244,MAIVSPXLSIITLNVNGLNSPIKRHRVAEWIKKQDPTICCLQETHF...,299220,21747,Mammalia
4,L1M4_orf2DF0000250,MADVNPTXSVITLNVNGLNTPIKRQRLAEWIKKHDPTICCLQETHF...,313178,32089,Mammalia
...,...,...,...,...,...
715,TE3105_SO2_FAM184DF0289547,TRSKVLLQPLFPPQTYPFFLPPMHGISQINLHHAKGSSATIARLFE...,50037,2731,Insecta
716,TE3121_SO2_FAM192DF0289563,PLXGGGTSVARGGWEIPCPPYNRGTLTYPRNLMHGISQINLHHAKG...,50229,1546,Insecta
717,TE3146_SO2_FAM209DF0289588,RRGFLTTRMAPSQPGLRVVQINLNHCEAATEDLMLFMSEKKVDVAL...,54467,2789,Insecta
718,TE3261_SO2_FAM454DF0289702,ISENGLPTAVQKRNGTLTVNHFPKILMADKNDRMALSPGHPLSGER...,9477,796,Insecta


# Protein extraction with ORFinder

In [None]:
#record = SeqIO.read("NC_005816.fna", "fasta") #parse sequence
table = 1  #set table for aa
min_pro_len = 1000 #lower limit for protein length

def find_orfs_with_trans(seq, trans_table,min_protein_lenght):
  answer=[] #list of results
  seq_len = len(seq)
  for strand, nuc in ([+1, seq], (-1, seq.reverse_complement())):
    for frame in range(3):
      trans = nuc[frame:].translate(trans_table)
      trans_len = len(trans)
      aa_start = 0
      aa_end = 0
      while aa_start < trans_len:
        aa_end = trans.find("*",aa_start)
        if aa_end == -1:
          aa_end = trans_len
        if aa_end - aa_start >= min_protein_lenght:
          if aa_end == 1:
            start = frame + aa_start * 3
            end = min(seq_len, frame + aa_end * 3 +3)
          else:
            start = seq_len - frame - aa_end * 3 -3
            end = seq_len - frame - aa_start * 3
          answer.append ((start, end, strand, trans[aa_start:aa_end]))
        aa_start = aa_end + 1 

  answer.sort()  #sort the table for start base position
  return answer

# -- iterate over all elements in DNA_seq col
record_list=[]
for i in filtered_df['DNA_seq']:
  if type(i) == str:
    record = Seq(i)
    record_list.append(record)
    #print (record)
    orf_list = find_orfs_with_trans(record, table, min_pro_len) # -- call ORFinder
  else:
    continue
#len(record_list)
#len(orf_list)
for start, end, strand, pro in orf_list:  # -- show results 
  print ("%s ...%s - lenght %i, strand %i, %i:%i" % (pro[:20], pro[-3:], len(pro), strand, start, end))


In [None]:
# -- iterate over all elements in DNA_seq col using complex ORFinder
record_list=[]
for i in filtered_df['DNA_seq']:
  if type(i) == str:
    record = Seq(i)
    record_list.append(record)
    #print (record)
    orf_list = find_orfs_with_trans(record, table, min_pro_len) # -- call ORFinder
  else:
    continue
#len(record_list)
#len(orf_list)
for start, end, strand, pro in orf_list:  # -- show results 
  print ("%s ...%s - lenght %i, strand %i, %i:%i" % (pro[:20], pro[-3:], len(pro), strand, start, end))

# Table supplementary ORF2/hits

In [None]:
import pandas as pd
df_2 = pd.read_csv("/content/ORF2_suppl.csv", delimiter=';')
df_2.columns = ['species', 'ORF2_CN']
#print(df)
#df.head()
df_2.sort_values(by=['ORF2_CN'], ascending=False)


Unnamed: 0,species,ORF2_CN
0,Monodelphis domestica,1607
44,Balaenoptera\nacutorostrata scammoni,1338
84,Rhinopithecus roxellana,714
75,Mus musculus,629
350,Brassica napus,544
...,...,...
262,Schistosoma curassoni,0
263,Schistosoma haematobium,0
264,Schistosoma japonicum,0
265,Schistosoma mansoni,0
