<a href="https://colab.research.google.com/github/LeandroJorquera/synthetic_L1/blob/main/ORF2p-retrieval_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install SeqIO
!pip install Bio

In [None]:
import ast

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# Reading and sorting dataset

In [None]:
# -- Load the preprocessed LINE table from the local CSV file
path = "/Users/leandrojorqueravalero/Desktop/PhD/Synthetic_ORF2p/preprocessed_LINE_v2.csv"
df = pd.read_csv(path, low_memory=False)

# Number of rows read (displayed as the cell output)
df.shape[0]


100000

In [None]:
# -- Select desired columns and rename
# Positional columns taken from the raw table:
#   19: ID + Dfam entry // 3: length // 8: DNA_seq // 18: counts info
sorted_df = df.iloc[:, [19, 3, 8, 18]]
sorted_df.columns = ['ID', 'Length', 'DNA_seq', 'Counts']

# -- Keep only the rows matching the required criterion (Length > 3000)
filtered_df = sorted_df.loc[sorted_df['Length'] > 3000]

# -- Removing duplicates
# drop_duplicates() returns a new frame and leaves its input untouched, so the
# result has to be assigned back; otherwise no row is actually removed.
# keep=False (as originally written) discards every row whose ID occurs more
# than once, including its first occurrence.
# NOTE(review): confirm whether one representative per ID should be retained
# instead (keep='first').
filtered_df = filtered_df.drop_duplicates(subset=['ID'], keep=False)
len(filtered_df.index)

26431

# Retrieving count numbers (only 1357 of the 26431 entries have valid counts)

In [None]:
# Build a table with the two count values stored for each ID.
# The 'Counts' cells hold the text form of a dict; the first value of that
# dict is itself a dict (or its text form) carrying the keys
# 'gathering_all' and 'gathering_nonredundant'.
name = filtered_df['ID']
counts = filtered_df['Counts']
output_dict = {}

for ID, raw_counts in zip(name, counts):
    # Only string cells can be parsed; anything else (e.g. NaN) is skipped
    if not isinstance(raw_counts, str):
        continue
    # ast.literal_eval accepts Python literals only, so text read from the CSV
    # can no longer execute arbitrary code the way eval() would.
    hmm_dict = ast.literal_eval(raw_counts)
    count_dict = list(hmm_dict.values())[0]
    if isinstance(count_dict, str):
        count_dict = ast.literal_eval(count_dict)
    # Local names avoid shadowing the built-in all() for the rest of the session
    n_all = count_dict['gathering_all']
    n_non_redundant = count_dict['gathering_nonredundant']
    output_dict[ID] = (n_all, n_non_redundant)  # associate both numbers with each ID

# Make a df with the ID as index
count_df = pd.DataFrame.from_dict(output_dict, orient='index')

# Reset the index so that ID becomes a regular column
ORF2p_counts = count_df.reset_index()

# Rename columns
ORF2p_counts.columns = ['ID', 'All', 'Non_redundant']
ORF2p_counts

# Protein extraction with a simpler ORF finder

In [None]:
# -- Extracting all protein sequences from DNA with simpler finder
# Each DNA sequence is translated in the three frames of both strands; every
# stop-free stretch of at least `min_pro_len` residues is collected.
min_pro_len = 1000  # minimum protein length (residues)
table = 1           # translation table handed to Seq.translate

prot_list = []      # every qualifying protein, in discovery order
prot_dict = {}      # ID -> protein sequence
name = filtered_df['ID']
DNA = filtered_df['DNA_seq']

for ID, dna in zip(name, DNA):
    # Skip non-string cells. Without this guard the loop would fall through and
    # reuse the `record` left over from the previous row, attaching another
    # element's protein to this ID.
    if not isinstance(dna, str):
        continue
    record = Seq(dna)
    for strand, nuc in [(+1, record), (-1, record.reverse_complement())]:  # sequence and reverse complement
        for frame in range(3):  # explore the 3 reading frames
            length = 3 * ((len(record) - frame) // 3)  # multiple of three
            for pro in nuc[frame:frame + length].translate(table).split("*"):
                if len(pro) >= min_pro_len:
                    prot_list.append(str(pro))
                    # NOTE(review): an ID with several qualifying ORFs keeps
                    # only the last one found; confirm this is intended.
                    prot_dict[ID] = str(pro)

# Make a df with the ID as index
prot_df = pd.DataFrame.from_dict(prot_dict, orient='index')
# Reset the index so that ID becomes a regular column
proteins_df = prot_df.reset_index()

# Rename columns
proteins_df.columns = ['ID', 'Seq']
proteins_df

In [None]:
# Re-applies the ['ID', 'Seq'] column names already assigned at the end of the
# previous cell, then displays the table.
proteins_df.columns = ['ID','Seq']
proteins_df

# Exporting prot_dict as a FASTA file

In [None]:
# Destination FASTA file
output_file = 'ORF2p_v2.fasta'

# One record per (ID, protein) pair; the ID is cast to str for the FASTA header
fasta_records = (
    SeqIO.SeqRecord(Seq(protein), id=str(identifier), description="")
    for identifier, protein in prot_dict.items()
)

# Write every record to the output file
with open(output_file, 'w') as out_file:
    SeqIO.write(fasta_records, out_file, 'fasta')

## Merging ORF2p seqs and count number

In [None]:
# -- Join the protein sequences with their counts (on the columns the two
#    frames share) and save the result as CSV without the index column
ORF2p_seqs_counts = proteins_df.merge(ORF2p_counts)

ORF2p_seqs_counts.to_csv(
    r'/Users/leandrojorqueravalero/Desktop/PhD/Synthetic_ORF2p/ORF2p_seqs_counts.csv',
    index=False,
)

# Protein extraction with ORFinder

In [None]:
table = 1           # translation table handed to Seq.translate
min_pro_len = 1000  # lower limit for protein length (residues)

def find_orfs_with_trans(seq, trans_table, min_protein_lenght):
    """Find stop-free stretches in all six reading frames of `seq`.

    Parameters
    ----------
    seq : sequence object supporting len(), slicing, reverse_complement()
        and translate() (e.g. Bio.Seq.Seq).
    trans_table : translation table passed through to `translate`.
    min_protein_lenght : minimum number of residues for a stretch to be reported.

    Returns
    -------
    list of (start, end, strand, protein) tuples sorted in ascending order.
    `start`/`end` are nucleotide positions on the given (forward) sequence,
    `strand` is +1 or -1, and `protein` is the translated stretch without the
    stop symbol.
    """
    answer = []
    seq_len = len(seq)
    for strand, nuc in ((+1, seq), (-1, seq.reverse_complement())):
        for frame in range(3):
            # Translate whole codons only, so no partial codon reaches translate()
            usable = 3 * ((seq_len - frame) // 3)
            trans = nuc[frame:frame + usable].translate(trans_table)
            trans_len = len(trans)
            aa_start = 0
            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    # No further stop: the stretch runs to the end of the frame
                    aa_end = trans_len
                if aa_end - aa_start >= min_protein_lenght:
                    # Coordinates depend on the strand being scanned (the
                    # previous test on `aa_end == 1` sent forward-strand hits
                    # through the reverse-strand formula).
                    if strand == 1:
                        start = frame + aa_start * 3
                        end = min(seq_len, frame + aa_end * 3 + 3)
                    else:
                        # Clamp at 0: the "+3" for the stop codon can run past
                        # the sequence start when the stretch has no stop.
                        start = max(0, seq_len - frame - aa_end * 3 - 3)
                        end = seq_len - frame - aa_start * 3
                    answer.append((start, end, strand, trans[aa_start:aa_end]))
                aa_start = aa_end + 1

    answer.sort()  # order by start position (then end, strand, protein)
    return answer

# -- iterate over all elements in DNA_seq col
# ORFs are accumulated across every sequence; assigning the result of each call
# to `orf_list` would keep only the ORFs of the last sequence (and leave the
# name undefined when no row holds a string).
record_list = []
orf_list = []
for dna in filtered_df['DNA_seq']:
    if not isinstance(dna, str):
        continue  # non-string cells cannot be turned into a Seq
    record = Seq(dna)
    record_list.append(record)
    orf_list.extend(find_orfs_with_trans(record, table, min_pro_len))  # -- call ORFinder

for start, end, strand, pro in orf_list:  # -- show results
    print("%s ...%s - lenght %i, strand %i, %i:%i" % (pro[:20], pro[-3:], len(pro), strand, start, end))


In [None]:
# -- iterate over all elements in DNA_seq col using complex ORFinder
# ORFs are accumulated across every sequence; assigning the result of each call
# to `orf_list` would keep only the ORFs of the last sequence (and leave the
# name undefined when no row holds a string).
record_list = []
orf_list = []
for dna in filtered_df['DNA_seq']:
    if not isinstance(dna, str):
        continue  # non-string cells cannot be turned into a Seq
    record = Seq(dna)
    record_list.append(record)
    orf_list.extend(find_orfs_with_trans(record, table, min_pro_len))  # -- call ORFinder

for start, end, strand, pro in orf_list:  # -- show results
    print("%s ...%s - lenght %i, strand %i, %i:%i" % (pro[:20], pro[-3:], len(pro), strand, start, end))

# Supplementary table: ORF2/hits

In [None]:
import pandas as pd

# Read the semicolon-separated supplementary table and name its two columns
df_2 = pd.read_csv("/content/ORF2_suppl.csv", sep=';')
df_2.columns = ['species', 'ORF2_CN']

# Display the rows ordered from highest to lowest ORF2_CN
df_2.sort_values(by='ORF2_CN', ascending=False)


Unnamed: 0,species,ORF2_CN
0,Monodelphis domestica,1607
44,Balaenoptera\nacutorostrata scammoni,1338
84,Rhinopithecus roxellana,714
75,Mus musculus,629
350,Brassica napus,544
...,...,...
262,Schistosoma curassoni,0
263,Schistosoma haematobium,0
264,Schistosoma japonicum,0
265,Schistosoma mansoni,0


# Merge datasets

In [None]:
# -- merging datasets according to species name
# NOTE(review): without an `on=` argument pd.merge joins on the column names the
# two frames have in common. In this notebook df_2 is given the columns
# ['species', 'ORF2_CN'] and sorted_df the columns ['ID', 'Length', 'DNA_seq',
# 'Counts'], so no shared column is visible — presumably this cell was run
# against an earlier version of sorted_df. Confirm the intended key column
# before re-running from a fresh kernel.
merged_df = pd.merge(df_2,sorted_df)
#len(merged_df)
#merged_df.head(10)

# -- Keeping only the rows matching the length criterion (3000 <= length < 4000)
# NOTE(review): the length column is named 'Length' (capitalised) where
# sorted_df is built; 'length' is not among the visible column names — verify.
valid_orf2 = merged_df.loc[(merged_df['length']>=3000) & (merged_df['length']<4000)]
valid_orf2
len(valid_orf2)

#merged_unique = merged_df.drop_duplicates(subset=['nickname'], keep=False)
#print(len(merged_unique))
#merged_unique


686