In [None]:
!pip install biopython tqdm-joblib

In [None]:
import torch
from sequence_generator import generate_and_filter_sequences, convert_dna_fasta_to_protein
from models import Generator_lang

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Constructor arguments for the generator network.
# NOTE(review): presumably these must match the values used when the checkpoint
# below was trained, otherwise load_state_dict may reject it -- confirm.
n_chars, seq_len, batch_size, hidden_g = 5, 156, 64, 192

# Build the generator and move it onto the selected device.
generator = Generator_lang(n_chars, seq_len, batch_size, hidden_g)
generator = generator.to(device)

# Placeholder: point this at a checkpoint from the saved_model folder, i.e. the
# best epoch for whichever evaluation metric you want to use.
saved_model_path = r"filepath"

# map_location loads the checkpoint's tensors onto `device`.
checkpoint = torch.load(saved_model_path, map_location=device)
generator.load_state_dict(checkpoint['model_state_dict'])
generator.eval()  # switch the module to evaluation mode

# Generate and filter sequences with the loaded generator.
#   num_samples: tune this so the generator ends up creating close to 5000 peptides.
#   count_atg:   set to True to keep only the peptides that start with methionine.
#   add_atg:     set to True if you want your peptides to start with methionine.
sequences, analysis, dna_file = generate_and_filter_sequences(
    generator=generator,
    num_samples=11270 - 8 * 640,
    count_atg=False,
    add_atg=False,
)

# Convert the DNA FASTA returned above into proteins.
num_proteins, protein_file = convert_dna_fasta_to_protein(
    input_fasta=dna_file,
    add_atg=False,
)

In [None]:
from analyze_amp_probabilities import calculate_averages

# Manual step before running this cell:
#   1. Submit valid_sequences_proteins.fasta (from the campr4 folder) to the
#      CAMPR4 tool.
#   2. Download the 3 result files it produces (rf, svm, ann).
#   3. Replace the placeholder paths below with the downloaded files.
rf_file_path = "rf_file_path"
svm_file_path = "svm_file_path"
ann_file_path = "ann_file_path"

# `files` and `model_types` are parallel lists: entry i of one belongs to
# entry i of the other, so keep them in the same order.
files = [rf_file_path, svm_file_path, ann_file_path]
model_types = ["Random Forest", "SVM", "ANN"]

calculate_averages(files, model_types)

In [None]:
from Similarity import calculate_protein_similarity

# Path to the protein FASTA to analyse -- replace with your own path.
# Note: this overwrites any `protein_file` value set by an earlier cell.
protein_file = "valid_sequences_proteins.fasta"

# Similarity computed over the sequences in `protein_file`.
similarity = calculate_protein_similarity(protein_file)

In [None]:
from Similarity_fast import calculate_protein_similarity_parallel

# Alternative to calculate_protein_similarity: use this cell instead if you want
# the computation to run in parallel and finish quicker.

# Path to the protein FASTA to analyse -- replace with your own path.
protein_file = "valid_sequences_proteins.fasta"

# Note: this reuses (and overwrites) the `similarity` name.
similarity = calculate_protein_similarity_parallel(protein_file)

In [None]:
from Similarity import calculate_similarity_between_dna

# Similarity between the original dataset and the generated sequences.
# NOTE(review): the original note here said "peptides", but the helper's name
# refers to DNA -- confirm which level the comparison is made at.

# Replace both placeholders with your own paths.
original_dataset_path = r"all_amp.fasta"
valid_sequences_path = r"valid_sequences.fasta"

# Note: this reuses (and overwrites) the `similarity` name.
similarity = calculate_similarity_between_dna(
    original_dataset_path,
    valid_sequences_path,
)