In [None]:
%%capture
!pip install SeqIO
!pip install Bio

In [None]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

In [None]:
def read_dataset(input_path, min_length = 3000):
    """Read the Dfam CSV, keep the columns of interest and filter by sequence length.

    Parameters
    ----------
    input_path : str or path-like
        CSV file handed to ``pandas.read_csv``.
    min_length : int, default 3000
        Rows are kept only when their ``Length`` is strictly greater than this value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df, filtered_df)``: the full table exactly as read, and the five-column
        selection (``ID``, ``Length``, ``DNA_seq``, ``Counts``, ``Species``)
        restricted to rows with ``Length > min_length``.
    """
    df = pd.read_csv(input_path, low_memory=False)

    # Columns are selected by position, so the CSV layout must stay fixed:
    # 19 -> ID, 3 -> Length, 8 -> DNA_seq, 18 -> Counts, 16 -> Species.
    # An explicit copy makes the rename below independent of `df`, so there is
    # no need to silence pandas' chained-assignment warnings for the whole session.
    sorted_df = df.iloc[:, [19, 3, 8, 18, 16]].copy()
    sorted_df.columns = ['ID', 'Length', 'DNA_seq', 'Counts', 'Species']

    filtered_df = sorted_df.loc[sorted_df['Length'] > min_length]

    return df, filtered_df

def get_taxa(filtered_df, eukarya_classes):
    """Assign each entry the Eukarya class found in its ``Species`` lineage string.

    The ``Species`` value is split on ``";"`` and every piece is compared, as-is,
    against ``eukarya_classes``. When several pieces match, the last one wins.
    Rows whose ``Species`` is not a string (e.g. NaN) and rows without any match
    are left out of the result.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain the columns ``ID`` and ``Species``.
    eukarya_classes : container of str
        Class names to look for (membership is tested with ``in``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(L1_taxo, mammals_taxo)``, both with columns ``ID`` and ``Taxonomy``;
        the second holds only the rows whose taxonomy is ``'Mammalia'``.
        Both are empty (but still carry the two columns) when nothing matches.
    """
    taxo_dict = {}
    for ID, spec in zip(filtered_df['ID'], filtered_df['Species']):
        if not isinstance(spec, str):
            # Missing lineage: nothing to split.
            continue
        for word in spec.split(";"):
            if word in eukarya_classes:
                taxo_dict[ID] = word

    # Building the frame from (ID, class) pairs with explicit column names keeps
    # the two-column layout even when no entry matched.
    L1_taxo = pd.DataFrame(list(taxo_dict.items()), columns=['ID', 'Taxonomy'])
    mammals_taxo = L1_taxo.loc[L1_taxo['Taxonomy'] == 'Mammalia']  # Select only mammals
    return L1_taxo, mammals_taxo

def get_count_number(filtered_df):
    """Extract the hit counts stored as text in the ``Counts`` column.

    Each string in ``Counts`` is parsed as a Python literal dict. The first value
    of that dict (itself a dict, or a string holding one) must provide the keys
    ``'gathering_all'`` and ``'gathering_nonredundant'``. Rows whose ``Counts``
    is not a string (e.g. NaN) are skipped.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain the columns ``ID`` and ``Counts``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``All`` and ``Non_redundant``; empty (with those columns)
        when no row carries count information.

    Raises
    ------
    ValueError, SyntaxError
        If a ``Counts`` string is not a plain Python literal.
    KeyError, IndexError
        If the parsed structure lacks the expected entries.
    """
    # SECURITY: the cell text comes from an external CSV, so it is parsed with
    # ast.literal_eval (literals only) instead of eval, which would execute
    # arbitrary expressions embedded in the data.
    import ast

    output_dict = {}
    for ID, raw_counts in zip(filtered_df['ID'], filtered_df['Counts']):
        if not isinstance(raw_counts, str):
            continue
        hmm_dict = ast.literal_eval(raw_counts)
        count_dict = list(hmm_dict.values())[0]
        if not isinstance(count_dict, dict):
            # The inner record may itself be serialised as text.
            count_dict = ast.literal_eval(str(count_dict))
        all_count = count_dict['gathering_all']
        non_redun = count_dict['gathering_nonredundant']
        output_dict[ID] = (all_count, non_redun)

    # Explicit column names keep the three-column layout even with zero rows.
    L1_counts = pd.DataFrame(
        [(ID, all_count, non_redun) for ID, (all_count, non_redun) in output_dict.items()],
        columns=['ID', 'All', 'Non_redundant'],
    )
    return L1_counts

def extract_protein_sequences(filtered_df, min_pro_len=1000, table=1,
                              output_file='ORF2p_newfinder.fasta'):
    """Translate every entry in all six reading frames and keep its longest long ORF.

    For each row with a string ``DNA_seq``, both strands are translated in the
    three frames, the translation is split at stop codons (``*``) and the longest
    segment of at least ``min_pro_len`` residues is stored for that ``ID``
    (the first one encountered wins on ties). Segments are not required to start
    with methionine. The selected proteins are also written to ``output_file``
    in FASTA format, using the ID as header and no description.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain the columns ``ID`` and ``DNA_seq``.
    min_pro_len : int, default 1000
        Minimum protein length (residues) for a segment to be kept.
    table : int or str, default 1
        Genetic code passed to ``Seq.translate``.
    output_file : str, default 'ORF2p_newfinder.fasta'
        Path of the FASTA file to write (overwritten if it exists).

    Returns
    -------
    tuple of (dict, pandas.DataFrame)
        ``(prot_dict, proteins_df)``: the ``ID -> protein string`` mapping and the
        same data as a frame with columns ``ID`` and ``prot_seq`` (empty, with
        those columns, when no entry qualifies).
    """
    prot_dict = {}

    for ID, dna in zip(filtered_df['ID'], filtered_df['DNA_seq']):
        if not isinstance(dna, str):
            continue
        record = Seq(dna)
        best = None
        for nuc in (record, record.reverse_complement()):
            for frame in range(3):
                # Trim to a whole number of codons so translate() gets no partial codon.
                length = 3 * ((len(nuc) - frame) // 3)
                for pro in nuc[frame : frame + length].translate(table).split("*"):
                    if len(pro) >= min_pro_len and (best is None or len(pro) > len(best)):
                        best = pro
        if best is not None:
            prot_dict[ID] = str(best)

    # Explicit column names keep the two-column layout even with zero proteins.
    proteins_df = pd.DataFrame(list(prot_dict.items()), columns=['ID', 'prot_seq'])

    # Write the proteins in FASTA format; IDs are cast to str for the header.
    with open(output_file, 'w') as out_file:
        records = (
            SeqIO.SeqRecord(Seq(seq), id=str(seq_id), description="")
            for seq_id, seq in prot_dict.items()
        )
        SeqIO.write(records, out_file, 'fasta')

    return prot_dict, proteins_df  # dict (as exported to FASTA) and frame for merging

def merge_df(proteins_df, ORF2p_counts, L1_taxo=None, mammals_taxo=None):
    """Join the protein table with the count table and with the taxonomy tables.

    All joins use ``pandas.merge`` defaults (inner join on the columns the two
    frames have in common).

    Parameters
    ----------
    proteins_df : pandas.DataFrame
        Protein sequences per entry.
    ORF2p_counts : pandas.DataFrame
        Count information per entry.
    L1_taxo : pandas.DataFrame, optional
        Taxonomy table for all entries. When omitted, the notebook-level variable
        of the same name is used.
    mammals_taxo : pandas.DataFrame, optional
        Taxonomy table restricted to mammals. When omitted, the notebook-level
        variable of the same name is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ORF2_with_counts, ORF2p_taxo, ORF2p_mammals)``.

    Raises
    ------
    NameError
        If a taxonomy table is neither passed in nor defined at notebook level.
    """
    def _notebook_global(name):
        # Fallback for callers that rely on the tables living in the notebook namespace.
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"name '{name}' is not defined; pass it to merge_df explicitly"
            ) from None

    if L1_taxo is None:
        L1_taxo = _notebook_global('L1_taxo')
    if mammals_taxo is None:
        mammals_taxo = _notebook_global('mammals_taxo')

    ORF2_with_counts = pd.merge(proteins_df, ORF2p_counts)
    ORF2p_taxo = pd.merge(ORF2_with_counts, L1_taxo)
    ORF2p_mammals = pd.merge(ORF2_with_counts, mammals_taxo)

    return ORF2_with_counts, ORF2p_taxo, ORF2p_mammals


**ORF finder that looks for M as the first residue (yields fewer proteins)**

In [None]:
def get_prot_orfs(filtered_df, min_pro_len=1000, output_file='ORF2p_newfinder.fasta'):
    """Find, per entry, the longest methionine-initiated ORF over all six frames.

    For each row with a string ``DNA_seq``, both strands are translated (standard
    genetic code) in the three frames and split at stop codons (``*``). Only
    segments whose first residue is ``M`` are candidates. The longest candidate
    over the six frames is kept when it has at least ``min_pro_len`` residues
    (the first one encountered wins on ties), so a hit on one strand is never
    overwritten by a shorter hit on the other. The kept proteins are also written
    to ``output_file`` in FASTA format, using the ID as header and no description.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain the columns ``ID`` and ``DNA_seq``.
    min_pro_len : int, default 1000
        Minimum protein length (residues) for an ORF to be kept.
    output_file : str, default 'ORF2p_newfinder.fasta'
        Path of the FASTA file to write (overwritten if it exists).

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` and ``prot_seq``; empty (with those columns) when no
        entry qualifies.
    """
    prot_dict = {}

    for _, row in filtered_df.iterrows():
        ID = row['ID']
        DNA = row['DNA_seq']
        if not isinstance(DNA, str):
            continue
        record = Seq(DNA)  # Generate Seq object from string DNA seq
        best = None
        # Forward strand, then reverse complement: three frames each.
        for nuc in (record, record.reverse_complement()):
            for frame in range(3):
                # Trim to a whole number of codons so translate() gets no partial codon.
                length = 3 * ((len(nuc) - frame) // 3)
                for pro in nuc[frame : frame + length].translate(table=1).split("*"):
                    candidate = str(pro)
                    if candidate.startswith('M') and (best is None or len(candidate) > len(best)):
                        best = candidate
        if best is not None and len(best) >= min_pro_len:
            prot_dict[ID] = best

    # Explicit column names keep the two-column layout even with zero proteins.
    proteins_df = pd.DataFrame(list(prot_dict.items()), columns=['ID', 'prot_seq'])

    # Write the proteins in FASTA format; IDs are cast to str for the header.
    with open(output_file, 'w') as out_file:
        records = (
            SeqIO.SeqRecord(Seq(seq), id=str(seq_id), description="")
            for seq_id, seq in prot_dict.items()
        )
        SeqIO.write(records, out_file, 'fasta')

    return proteins_df

In [None]:
input_path = "/Users/leandro/Desktop/ai_data/data/preprocessed_LINE_v2.csv"
# Parse the CSV a single time: read_dataset returns (full table, filtered selection).
df, filtered_df = read_dataset(input_path)

In [None]:
#len(df.index) # -- Check df is read completely

In [None]:
# -- Select columns of interest
# NOTE(review): `sort_and_select_columns` is not defined in any cell shown here, and
# `read_dataset` already performs the column selection -- confirm whether this cell
# is stale (on a fresh kernel it would raise NameError unless defined elsewhere).
sorted_df = sort_and_select_columns(df)

In [None]:
#len(filtered_df.index) # -- Check length

In [None]:
# -- Define main eukarya classes (list from ChatGPT) extract from each entry based on matching to list
eukarya_classes = ['Mammalia', 'Aves', 'Reptilia', 'Actinopterygii', 'Amphibia', 'Insecta', 'Fungi', 'Plantae']
# get_taxa returns (all labelled entries, mammals only); run it once and unpack both.
L1_taxo, mammals_taxo = get_taxa(filtered_df, eukarya_classes)

In [None]:
len(L1_taxo.index)  # -- Only 1342 entries out of 26k have Taxonomy label

In [None]:
len(mammals_taxo.index)

In [None]:
L1_counts = get_count_number(filtered_df)

In [None]:
len(L1_counts.index) # -- Only 1357 entries out of 26k have counts info

In [None]:
extract_orf = extract_protein_sequences(filtered_df)
# The function returns (prot_dict, proteins_df); bind both names so they can be
# inspected and passed on (e.g. to merge_df).
prot_dict, proteins_df = extract_orf

In [None]:
#prot_dict
#proteins_df

In [None]:
def get_prot_orfs(filtered_df, min_pro_len=1000, output_file='ORF2p_newfinder.fasta'):
    """Find, per entry, the longest methionine-initiated ORF over all six frames.

    NOTE: this cell redefines ``get_prot_orfs`` from an earlier cell of the
    notebook; this version returns both the dict and the frame.

    For each row with a string ``DNA_seq``, both strands are translated (standard
    genetic code) in the three frames and split at stop codons (``*``). Only
    segments whose first residue is ``M`` are candidates. The longest candidate
    over the six frames is kept when it has at least ``min_pro_len`` residues
    (the first one encountered wins on ties), so a hit on one strand is never
    overwritten by a shorter hit on the other. The kept proteins are also written
    to ``output_file`` in FASTA format, using the ID as header and no description.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain the columns ``ID`` and ``DNA_seq``.
    min_pro_len : int, default 1000
        Minimum protein length (residues) for an ORF to be kept.
    output_file : str, default 'ORF2p_newfinder.fasta'
        Path of the FASTA file to write (overwritten if it exists).

    Returns
    -------
    tuple of (dict, pandas.DataFrame)
        ``(prot_dict, proteins_df)``: the ``ID -> protein string`` mapping and the
        same data as a frame with columns ``ID`` and ``prot_seq`` (empty, with
        those columns, when no entry qualifies).
    """
    prot_dict = {}

    for _, row in filtered_df.iterrows():
        ID = row['ID']
        DNA = row['DNA_seq']
        if not isinstance(DNA, str):
            continue
        record = Seq(DNA)  # Generate Seq object from string DNA seq
        best = None
        # Forward strand, then reverse complement: three frames each.
        for nuc in (record, record.reverse_complement()):
            for frame in range(3):
                # Trim to a whole number of codons so translate() gets no partial codon.
                length = 3 * ((len(nuc) - frame) // 3)
                for pro in nuc[frame : frame + length].translate(table=1).split("*"):
                    candidate = str(pro)
                    if candidate.startswith('M') and (best is None or len(candidate) > len(best)):
                        best = candidate
        if best is not None and len(best) >= min_pro_len:
            prot_dict[ID] = best

    # Explicit column names keep the two-column layout even with zero proteins.
    proteins_df = pd.DataFrame(list(prot_dict.items()), columns=['ID', 'prot_seq'])

    # Write the proteins in FASTA format; IDs are cast to str for the header.
    with open(output_file, 'w') as out_file:
        records = (
            SeqIO.SeqRecord(Seq(seq), id=str(seq_id), description="")
            for seq_id, seq in prot_dict.items()
        )
        SeqIO.write(records, out_file, 'fasta')

    return prot_dict, proteins_df

In [None]:
import time
# Wall-clock timing of the call below.
start_time = time.time()
# NOTE(review): `main` is not defined in any cell shown here -- on a fresh kernel
# this call would raise NameError unless it is defined elsewhere; confirm.
main()
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# The most recent definition of get_prot_orfs returns (prot_dict, proteins_df);
# unpack it so that prot_df is the DataFrame the following cell measures.
prot_orfs_dict, prot_df = get_prot_orfs(filtered_df)

In [None]:
len(prot_df.index)