In [None]:
%%capture
!pip install SeqIO
!pip install Bio

In [None]:
import ast

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

In [None]:
def read_dataset(input_path):
    """Load the Dfam dataset stored as CSV at ``input_path``.

    Args:
        input_path: Path (or file-like object) handed straight to
            ``pd.read_csv``.

    Returns:
        The parsed DataFrame. ``low_memory=False`` is passed so pandas does
        not infer column types chunk by chunk.
    """
    return pd.read_csv(input_path, low_memory=False)

def sort_and_select_columns(df):
    """Pick five columns by position and give them fixed names.

    Despite the function name, no row sorting happens: columns at positions
    19, 3, 8, 18 and 16 of ``df`` are taken (in that order) and renamed to
    ``ID``, ``Length``, ``DNA_seq``, ``Counts`` and ``Species``.

    Args:
        df: Raw DataFrame with at least 20 columns.

    Returns:
        A five-column DataFrame; ``df`` itself keeps its original labels.
    """
    column_positions = [19, 3, 8, 18, 16]
    column_names = ['ID', 'Length', 'DNA_seq', 'Counts', 'Species']
    selection = df.iloc[:, column_positions]
    selection.columns = column_names
    return selection

def filter_by_length(df, min_length=3000):
    """Keep only rows whose ``Length`` is strictly greater than ``min_length``.

    Rows with ``Length == min_length`` are dropped as well.

    Side effect: sets ``pd.options.mode.chained_assignment`` to ``None``,
    which switches off pandas' chained-assignment warning globally, not just
    inside this call.

    Args:
        df: DataFrame with a numeric ``Length`` column.
        min_length: Exclusive lower bound on ``Length``.

    Returns:
        The subset of ``df`` passing the length test.
    """
    # Global pandas option: silences SettingWithCopyWarning from here on.
    pd.options.mode.chained_assignment = None
    is_long_enough = df['Length'] > min_length
    return df.loc[is_long_enough]

def retrieve_eukarya_class(filtered_df, eukarya_classes):
    """Map each entry ID to the Eukarya class found in its ``Species`` value.

    Every ``Species`` string is split on ``;`` and each piece is compared, as
    an exact string, against ``eukarya_classes``. When several pieces match,
    the last one in the string wins. Rows whose ``Species`` is not a string
    (e.g. NaN for a missing value) are skipped instead of raising.

    Args:
        filtered_df: DataFrame with ``ID`` and ``Species`` columns.
        eukarya_classes: Collection of class names to look for.

    Returns:
        DataFrame with columns ``ID`` and ``Taxonomy``, one row per ID that
        had a match. It is empty, with the same two columns, when no row
        matches.
    """
    taxo_dict = {}
    for entry_id, lineage in zip(filtered_df['ID'], filtered_df['Species']):
        # pandas reads missing cells as NaN (a float), which has no .split().
        if not isinstance(lineage, str):
            continue
        for word in lineage.split(";"):
            if word in eukarya_classes:
                # No break: a later match deliberately overwrites an earlier one.
                taxo_dict[entry_id] = word
    # Building from (ID, class) pairs with explicit column names also works
    # when nothing matched, where renaming a reset index would fail.
    L1_taxo = pd.DataFrame(list(taxo_dict.items()), columns=['ID', 'Taxonomy'])
    return L1_taxo

def retrieve_count_number(filtered_df):
    """Extract the two gathering counts stored in the ``Counts`` column.

    Each string in ``Counts`` is parsed as a Python dict literal. The first
    value of that dict (itself a dict, or a string holding one) supplies the
    ``gathering_all`` and ``gathering_nonredundant`` entries. Rows whose
    ``Counts`` is not a string (e.g. NaN) or parses to an empty dict are
    skipped.

    Args:
        filtered_df: DataFrame with ``ID`` and ``Counts`` columns.

    Returns:
        DataFrame with columns ``ID``, ``All`` and ``Non_redundant``, one row
        per ID that carried count information. It is empty, with the same
        columns, when no row does.

    Raises:
        ValueError, SyntaxError: If a ``Counts`` string is not a valid Python
            literal.
        KeyError: If the parsed counts lack one of the two expected keys.
    """
    output_dict = {}
    for entry_id, raw_counts in zip(filtered_df['ID'], filtered_df['Counts']):
        if not isinstance(raw_counts, str):
            continue
        # ast.literal_eval only accepts literals, so text coming from the CSV
        # can never execute code the way eval() could.
        hmm_dict = ast.literal_eval(raw_counts)
        if not hmm_dict:
            continue
        # Only the first entry of the outer dict is used.
        count_dict = next(iter(hmm_dict.values()))
        if isinstance(count_dict, str):
            # The nested counts may themselves be stored as a dict literal.
            count_dict = ast.literal_eval(count_dict)
        all_count = count_dict['gathering_all']
        non_redun = count_dict['gathering_nonredundant']
        output_dict[entry_id] = (all_count, non_redun)
    # Build, name and return the same frame. The previous code assigned to
    # ``count_df`` but renamed and returned an undefined ``ORF2p_counts``.
    ORF2p_counts = pd.DataFrame(
        [(entry_id, all_count, non_redun)
         for entry_id, (all_count, non_redun) in output_dict.items()],
        columns=['ID', 'All', 'Non_redundant'],
    )
    return ORF2p_counts

def extract_protein_sequences(filtered_df, min_pro_len=1000, table=1):
    """Translate each DNA sequence in all six frames and keep long open stretches.

    For every row with a string ``DNA_seq``, the forward strand and its
    reverse complement are each translated in frames 0, 1 and 2. Every
    translation is split at stop codons (``*``), and any piece of at least
    ``min_pro_len`` residues is recorded for that ID. No start codon is
    required.

    NOTE(review): only one protein is stored per ID. If several pieces
    qualify, the last one found (forward strand before reverse, frame 0 to 2,
    5' to 3') overwrites the earlier ones.

    Args:
        filtered_df: DataFrame with ``ID`` and ``DNA_seq`` columns.
        min_pro_len: Minimum protein length, in residues, to keep.
        table: Translation table passed to Biopython's ``Seq.translate``
            (1 is the standard genetic code).

    Returns:
        DataFrame indexed by ``ID`` with one column, ``protein_sequence``.
        IDs without a qualifying protein, or with a non-string ``DNA_seq``,
        are absent.
    """
    prot_dict = {}
    for entry_id, dna in zip(filtered_df['ID'], filtered_df['DNA_seq']):
        # Missing sequences are read by pandas as NaN (a float); skip them.
        if not isinstance(dna, str):
            continue
        record = Seq(dna)
        for nuc in (record, record.reverse_complement()):
            for frame in range(3):
                # Trim to a whole number of codons so translate() never sees
                # a partial trailing codon.
                length = 3 * ((len(nuc) - frame) // 3)
                for pro in nuc[frame:frame + length].translate(table).split("*"):
                    if len(pro) >= min_pro_len:
                        prot_dict[entry_id] = str(pro)

    proteins_df = pd.DataFrame.from_dict(prot_dict, orient='index', columns=['protein_sequence'])
    proteins_df.index.name = 'ID'

    return proteins_df

def write_fasta(output_file, prot_dict):
    """Save a ``{ID: protein sequence}`` mapping as a FASTA file.

    Each record's header is the ID converted to ``str``, with an empty
    description. ``output_file`` is opened in write mode, so an existing file
    is overwritten.

    Args:
        output_file: Destination path of the FASTA file.
        prot_dict: Mapping from sequence ID to sequence string.
    """
    with open(output_file, 'w') as handle:
        # IDs are cast to str so non-string keys still give valid headers.
        records = (
            SeqIO.SeqRecord(Seq(sequence), id=str(identifier), description="")
            for identifier, sequence in prot_dict.items()
        )
        SeqIO.write(records, handle, 'fasta')

def merge_counts(proteins_df, ORF2p_counts):
    """Inner-join the protein table with the counts table.

    ``pd.merge`` with no ``on`` argument joins on the column names the two
    frames share and ignores the index. A frame whose IDs live in an index
    named ``ID`` (as returned by ``extract_protein_sequences``) would share no
    column with the counts table, and the merge would raise. Such an index is
    therefore promoted to a regular ``ID`` column first.

    Args:
        proteins_df: Protein sequences, with ``ID`` as a column or as the
            index name.
        ORF2p_counts: Count table, with ``ID`` as a column or as the index
            name.

    Returns:
        DataFrame holding the rows present in both inputs, joined on their
        shared columns.
    """
    def _with_id_column(frame):
        # Turn an index named 'ID' into a column, unless 'ID' already is one.
        if 'ID' not in frame.columns and frame.index.name == 'ID':
            return frame.reset_index()
        return frame

    ORF2p_seqs_counts = pd.merge(
        _with_id_column(proteins_df),
        _with_id_column(ORF2p_counts),
    )
    return ORF2p_seqs_counts

def merge_taxa(ORF2p_seqs_counts, L1_taxo):
    """Attach taxonomy labels to the sequences-and-counts table.

    This is an inner join on the columns the two frames have in common, so
    rows without a partner in the other frame are dropped.

    Args:
        ORF2p_seqs_counts: Table of sequences and counts.
        L1_taxo: Table of taxonomy labels.

    Returns:
        The merged DataFrame.
    """
    return ORF2p_seqs_counts.merge(L1_taxo)

def export_to_csv(df, out_path):
    """Write ``df`` as CSV to ``out_path``, leaving out the index column.

    Args:
        df: DataFrame to export.
        out_path: Full destination, including the file name.
    """
    df.to_csv(path_or_buf=out_path, index=False)

In [None]:
# -- Load the preprocessed CSV. The path is absolute and machine-specific,
# -- so it must be edited before running this notebook elsewhere.
input_path = "/Users/leandrojorqueravalero/Desktop/PhD/synthetic_ORF2/data/preprocessed_LINE_v2.csv"
df = read_dataset(input_path)
#len(df.index) # -- Optional check: the whole file was read

In [None]:
# -- Select the columns of interest; they come back renamed to
# -- ID, Length, DNA_seq, Counts and Species (no row sorting is done)
sorted_df = sort_and_select_columns(df)
#len(sorted_df.index) # -- Optional check: row count is unchanged

In [None]:
# -- Keep entries whose Length exceeds 3000, to retain only ORF2 or full L1
filtered_df = filter_by_length(sorted_df, min_length=3000)
#len(filtered_df.index)   # -- Optional check: number of entries left after filtering

In [None]:
# -- Define the main Eukarya groups to look for (list obtained from ChatGPT).
# -- Each entry is labelled by exact match of its Species pieces against this list.
eukarya_classes = ['Mammalia', 'Aves', 'Reptilia', 'Actinopterygii', 'Amphibia', 'Insecta', 'Fungi', 'Plantae']
L1_taxo = retrieve_eukarya_class(filtered_df, eukarya_classes)
#len(L1_taxo.index)  # -- Only 1342 entries out of 26k have a Taxonomy label

In [None]:
# -- Extract the count values stored in the Counts column, keyed by ID
ORF2p_counts = retrieve_count_number(filtered_df)
#len(ORF2p_counts.index) # -- Only 1357 entries out of 26k have counts info

In [None]:
# -- Six-frame translation of every DNA_seq; keeps one long protein per ID
proteins_df = extract_protein_sequences(filtered_df)
# -- Number of entries for which a protein was kept
len(proteins_df.index)