## Importing Protein FASTA

In [69]:
from Bio import SeqIO

In [100]:
def read_protein_fasta(fasta):
    """
    Load a protein FASTA file into a dictionary.

    Parameters
    ----------
    fasta : str
        Path of the FASTA file to read.

    Returns
    -------
    tuple
        ``(fasta, fasta_dict)``: the path exactly as it was passed in, and a
        dict mapping each record id to its sequence as a plain string. When
        an id occurs more than once, the last record with that id wins.
    """
    with open(fasta, "r") as handle:
        records = SeqIO.parse(handle, "fasta")
        # str() turns the Seq object into an ordinary string
        fasta_dict = {entry.id: str(entry.seq) for entry in records}
    return fasta, fasta_dict

In [110]:
# Load the test proteome; read_protein_fasta returns (path, {record id: sequence}).
protein_import = read_protein_fasta("./test_data/GCA_009697285_1.faa")
# Keep just the {record id: sequence} mapping for the cells below.
protein_dict = protein_import[1]
# Folder passed to create_db as the destination for its CSV.
output_folder = './Output/'

## AA Frequencies

In [116]:
import os
from collections import Counter
from itertools import product

import pandas as pd
from Bio import SeqUtils

In [117]:
def space_seperated_record(fasta_dict):
    """
    Pool all values of ``fasta_dict`` into one string.

    Each value is converted with ``str()`` and the pieces are joined with a
    single space, in the dictionary's iteration order. An empty dictionary
    gives an empty string.
    """
    pieces = [str(sequence) for sequence in fasta_dict.values()]
    return ' '.join(pieces)

In [118]:
def generate_combinations(nucleotides, X):
    """
    List every length-``X`` string that can be built from ``nucleotides``.

    Parameters
    ----------
    nucleotides : iterable of str
        Alphabet to draw from (letters may repeat within a result).
    X : int
        Length of each generated string.

    Returns
    -------
    list of str
        All ``len(nucleotides) ** X`` strings, in ``itertools.product`` order.
    """
    combos = []
    for letters in product(nucleotides, repeat=X):
        combos.append(''.join(letters))
    return combos

In [119]:
def get_aminoacids():
    """Return the letters of ``SeqUtils.IUPACData.protein_letters`` as a list, one letter per item."""
    return list(SeqUtils.IUPACData.protein_letters)

In [120]:
def count_aminoacids(fasta_dict):
    """
    Count each amino-acid letter across all sequences in ``fasta_dict``.

    The sequences are pooled into one upper-cased string, then every letter
    returned by ``get_aminoacids()`` is counted in it.

    Returns
    -------
    dict
        ``{letter: count}`` with keys in ``get_aminoacids()`` order.
    """
    pooled = space_seperated_record(fasta_dict).upper()
    return {letter: pooled.count(letter) for letter in get_aminoacids()}

In [121]:
def count_diaminoacids(fasta_dict):
    """
    Count every dipeptide (two adjacent residues) across all sequences.

    Occurrences are counted with a sliding window, so overlapping matches are
    included: "AAA" contains the dipeptide "AA" twice.

    Parameters
    ----------
    fasta_dict : dict
        Mapping whose values are protein sequences.

    Returns
    -------
    dict
        ``{dipeptide: count}`` for all two-letter combinations of the letters
        from ``get_aminoacids()``, in ``generate_combinations`` order.
        Dipeptides that never occur map to 0.
    """
    sequence = space_seperated_record(fasta_dict).upper()

    # One pass with a window that advances by a single residue.
    # str.count() would only report non-overlapping matches and would also
    # rescan the whole string once per dipeptide.
    # Windows that straddle the space between two records contain ' ' and
    # therefore never match a dipeptide key.
    window_counts = Counter(sequence[i:i + 2] for i in range(len(sequence) - 1))

    di_aas = generate_combinations(get_aminoacids(), 2)

    # Counter returns 0 for windows it never saw.
    return {di_aa: window_counts[di_aa] for di_aa in di_aas}

In [122]:
def count_triaminoacids(fasta_dict):
    """
    Count every tripeptide (three adjacent residues) across all sequences.

    Occurrences are counted with a sliding window, so overlapping matches are
    included: "AAAA" contains "AAA" twice and "ABABA" contains "ABA" twice.

    Parameters
    ----------
    fasta_dict : dict
        Mapping whose values are protein sequences.

    Returns
    -------
    dict
        ``{tripeptide: count}`` for all three-letter combinations of the
        letters from ``get_aminoacids()``, in ``generate_combinations`` order.
        Tripeptides that never occur map to 0.
    """
    sequence = space_seperated_record(fasta_dict).upper()

    # One pass with a window that advances by a single residue.
    # str.count() would only report non-overlapping matches and would also
    # rescan the whole string once per tripeptide.
    # Windows that straddle the space between two records contain ' ' and
    # therefore never match a tripeptide key.
    window_counts = Counter(sequence[i:i + 3] for i in range(len(sequence) - 2))

    tri_aas = generate_combinations(get_aminoacids(), 3)

    # Counter returns 0 for windows it never saw.
    return {tri_aa: window_counts[tri_aa] for tri_aa in tri_aas}

In [123]:
def create_dir(dir_path, mode='l'):
    """
    Create a directory (and any missing parents) if it does not exist.

    The signature matches the ``create_dir`` defined later in this notebook,
    so calls such as ``create_dir(path, 'q')`` work no matter which of the two
    cells was executed last.

    Parameters
    ----------
    dir_path : str
        Directory to create.
    mode : str, optional
        ``'l'`` (default) prints a confirmation message; any other value
        (for example ``'q'``) stays silent on success.

    Notes
    -----
    Errors are not raised: they are reported with a printed message.
    """
    try:
        os.makedirs(dir_path, exist_ok=True)  # exist_ok=True avoids raising an error if the directory already exists
        if mode == 'l':
            print(f"Directory '{dir_path}' is created or already exists.")
    except Exception as e:
        print(f"Error creating directory: {e}")

In [124]:
def merge_dicts(dict1, dict2, dict3, dict4):
    """
    Combine four dictionaries into a new one.

    Keys keep the position of their first appearance; when a key occurs in
    more than one input, the value from the later argument wins. The inputs
    are not modified.
    """
    merged = {}
    for part in (dict1, dict2, dict3, dict4):
        merged.update(part)
    return merged

In [125]:
def create_db(protein_import, output_folder):
    """
    Build a one-row table of residue, dipeptide and tripeptide counts and save it as CSV.

    Parameters
    ----------
    protein_import : sequence
        ``(input_file, seq_dict)`` pair, as returned by ``read_protein_fasta``.
    output_folder : str
        Directory that receives the CSV; it is created through ``create_dir``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with a ``file`` column holding the input path,
        followed by the columns produced by ``count_aminoacids``,
        ``count_diaminoacids`` and ``count_triaminoacids``.

    Side effects
    ------------
    Writes ``<input file name without extension>.aa_freq.csv`` into
    ``output_folder``.
    """
    source_path = protein_import[0]
    sequences = protein_import[1]

    # 'file' goes first so it becomes the leading column of the table
    row = merge_dicts(
        {'file': protein_import[0]},
        count_aminoacids(sequences),
        count_diaminoacids(sequences),
        count_triaminoacids(sequences),
    )
    aa_freq_df = pd.DataFrame(row, index=[0])

    stem = os.path.splitext(os.path.basename(source_path))[0]
    output_path = os.path.join(output_folder, stem + '.aa_freq.csv')

    create_dir(output_folder)
    aa_freq_df.to_csv(output_path, index=False)

    return aa_freq_df

In [126]:
# Build the count table for the loaded proteome and write its CSV into output_folder.
aa_df = create_db(protein_import, output_folder)

Directory './Output/' is created or already exists.


## Protein GRAVY Weight

In [72]:
from Bio import SeqUtils
import os
import shutil
from libsvm import svmutil
import io
import contextlib

In [73]:
def extractor(pro_dict):
    """
    Locate every 'K' in each protein sequence.

    Parameters
    ----------
    pro_dict : dict
        ``{protein id: sequence}``.

    Returns
    -------
    dict
        ``{protein id: [0-based positions of 'K']}``. Proteins without a 'K'
        map to an empty list. The comparison is case-sensitive: a lower-case
        'k' is not reported.
    """
    # A single pass per sequence. The former separate counting loop produced
    # a value that was never used.
    return {
        protein_id: [pos for pos, residue in enumerate(seq) if residue == 'K']
        for protein_id, seq in pro_dict.items()
    }

In [74]:
# Map each protein id to the 0-based positions of its 'K' residues.
k_indicies = extractor(protein_dict)

In [75]:
def fragment_extractor(k_indicies, protein_dict):
    """
    Cut a window of up to 15 residues around each listed position.

    For every index the window spans 7 residues before it through 7 residues
    after it; near either end of the sequence the window is simply truncated.

    Parameters
    ----------
    k_indicies : dict
        ``{protein id: [positions]}``, e.g. the output of ``extractor``.
    protein_dict : dict
        ``{protein id: sequence}``.

    Returns
    -------
    dict
        ``{protein id: [fragment, ...]}`` with one fragment per position, in
        the same order as the positions.
    """
    protein_frags = {}

    for protein_id, positions in k_indicies.items():
        # max() clamps the left edge at 0; slicing already clamps the right
        # edge at the end of the sequence.
        protein_frags[protein_id] = [
            protein_dict[protein_id][max(pos - 7, 0): pos + 8]
            for pos in positions
        ]

    return protein_frags

In [76]:
# Cut the sequence window around every 'K' position found above.
protein_frags = fragment_extractor(k_indicies, protein_dict)

In [77]:
def create_dir(dir_path, mode='l'):
    """
    Make sure ``dir_path`` exists, creating missing parent folders as well.

    Parameters
    ----------
    dir_path : str
        Directory to create.
    mode : str, optional
        ``'l'`` (default) prints a confirmation message; any other value
        stays silent on success.

    Notes
    -----
    Failures are not raised; they are reported with a printed message.
    """
    try:
        # exist_ok=True: an already existing directory is not an error
        os.makedirs(dir_path, exist_ok=True)
        if mode != 'l':
            return
        print(f"Directory '{dir_path}' is created or already exists.")
    except Exception as err:
        print(f"Error creating directory: {err}")

In [78]:
def delete_folder(folder_path):
    """
    Remove ``folder_path`` together with everything inside it.

    Prints a confirmation message on success.

    Raises
    ------
    Exception
        If the folder does not exist or cannot be removed. The message is
        prefixed with "Failed to delete folder" and includes the underlying
        error text.
    """
    try:
        # A missing folder is reported through the same failure path as any
        # other deletion problem.
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"The folder '{folder_path}' does not exist.")
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' and its contents have been deleted successfully.")
    except Exception as err:
        raise Exception(f"Failed to delete folder '{folder_path}': {err}")


In [79]:
# Residue keys in feature-column order: position i here becomes feature i + 1
# in the written file, so the order must not change.
_RESIDUE_ORDER = (
    'arg', 'his', 'lys', 'asp', 'glu',
    'asn', 'cys', 'gln', 'gly', 'ser',
    'thr', 'tyr', 'ala', 'ile', 'leu',
    'met', 'phe', 'pro', 'trp', 'val',
)

# Per-residue hydropathy weights used for the GRAVY value (feature 21).
_GRAVY_WEIGHTS = {
    'ala': 1.80, 'arg': -4.50, 'asn': -3.50, 'asp': -3.50, 'cys': 2.50,
    'gln': -3.50, 'glu': -3.50, 'gly': -0.40, 'his': -3.20, 'ile': 4.50,
    'leu': 3.80, 'lys': -3.90, 'met': 1.90, 'phe': 2.80, 'pro': -1.60,
    'ser': -0.80, 'thr': -0.70, 'trp': -0.90, 'tyr': -1.30, 'val': 4.20
}


def _fragment_feature_line(frag):
    """Return the tab-separated feature line (label 5, features 1-21) for one fragment."""
    one_to_three = SeqUtils.IUPACData.protein_letters_1to3

    counts = dict.fromkeys(_RESIDUE_ORDER, 0)
    for ch in frag:
        # The lookup yields names such as 'Pro'; the key for proline is
        # therefore 'pro'. Letters outside the lookup table (e.g. X or *)
        # are skipped instead of raising KeyError.
        three_letter = one_to_three.get(ch.upper(), '').lower()
        if three_letter in counts:
            counts[three_letter] += 1

    total = sum(counts.values())
    # A fragment without any recognised residue yields all-zero features
    # instead of a ZeroDivisionError, so every fragment still gets a row.
    denominator = total or 1

    percents = {aa: (n * 100) / denominator for aa, n in counts.items()}
    gravy = sum(counts[aa] * _GRAVY_WEIGHTS[aa] for aa in counts) / denominator

    # Values are rounded to two decimals.
    columns = "\t".join(f"{i + 1}:{round(percents[aa] * 100) / 100.0}"
                        for i, aa in enumerate(counts))
    return f"5\t{columns}\t21:{round(gravy * 100) / 100.0}\n"


def features(protein_frags):
    """
    Write one feature file per protein from its sequence fragments.

    Each fragment becomes one line: the constant label ``5``, features 1-20
    (percentage of each residue in ``_RESIDUE_ORDER``) and feature 21 (the
    weighted hydropathy average of the fragment). Lines are written in
    fragment order.

    Parameters
    ----------
    protein_frags : dict
        ``{protein id: [fragment, ...]}``, e.g. the output of
        ``fragment_extractor``.

    Returns
    -------
    dict
        ``{protein id: path of its .svm file}``. Proteins with no fragments
        get a folder under ``./tmp`` but no file and no entry here.

    Side effects
    ------------
    Empties and recreates ``./tmp`` and writes
    ``./tmp/<protein id>/<protein id>.svm`` for each protein with fragments.
    A failure to write a file is printed and that protein is left out of the
    result.
    """
    # delete_folder raises when the folder is missing, which would make the
    # very first run fail; only clear ./tmp when it is actually there.
    if os.path.exists('./tmp'):
        delete_folder('./tmp')
    create_dir('./tmp', 'q')

    smv_outputs = {}

    for protein_id, frags in protein_frags.items():

        temp_dir = os.path.join('./tmp', str(protein_id))
        create_dir(temp_dir, 'q')
        temp_feature_file = os.path.join(temp_dir, str(protein_id) + '.svm')

        if not frags:
            continue

        lines = [_fragment_feature_line(frag) for frag in frags]

        try:
            # Open the file once per protein rather than once per fragment.
            with open(temp_feature_file, 'a') as outs:
                outs.writelines(lines)

            smv_outputs[protein_id] = temp_feature_file

        except IOError as e:
            print(f"Error: {e}")

    return smv_outputs

In [80]:
# Write the per-protein feature files under ./tmp and keep their paths, keyed by protein id.
smv_outputs = features(protein_frags)

Folder './tmp' and its contents have been deleted successfully.


In [85]:
#import subprocess
#command = ['./svm_model/svm-predict', './tmp/PAANNGHF_00001.svm', './svm_model/Model', 'svm_output.txt']

# Run the command
#try:
#    subprocess.run(command, check=True)
#    print("svm-predict ran successfully.")
#except subprocess.CalledProcessError as e:
#    print(f"Error occurred: {e}")

In [83]:
def load_svm_model(model_path):
    """Load the model stored at ``model_path`` with ``svmutil.svm_load_model`` and return it."""
    return svmutil.svm_load_model(model_path)

In [84]:
def run_svm_model(model, svm_paths):
    """
    Run ``model`` on every feature file listed in ``svm_paths``.

    Parameters
    ----------
    model
        Model object accepted by ``svmutil.svm_predict`` (e.g. from
        ``load_svm_model``).
    svm_paths : dict
        ``{protein: path of its feature file}``.

    Returns
    -------
    dict
        ``{protein: (p_labels, p_acc, p_vals)}``, the three values returned
        by ``svmutil.svm_predict`` for that protein's file.
    """
    svm_results = {}

    for protein, svm_input in svm_paths.items():
        # The labels stored in the file are read but not used for prediction.
        _file_labels, samples = svmutil.svm_read_problem(svm_input)

        # Anything the prediction call prints to stdout is captured and discarded.
        with io.StringIO() as sink:
            with contextlib.redirect_stdout(sink):
                # An empty list is passed in place of true labels.
                labels, accuracy, values = svmutil.svm_predict([], samples, model)

        svm_results[protein] = (labels, accuracy, values)

    return svm_results

In [68]:
# Load the saved model, then predict for every feature file listed in smv_outputs.
model = load_svm_model('./svm_model/Model')
svm_results = run_svm_model(model, smv_outputs)