# Reading Files

In [1]:
from Bio import SeqIO
import pandas as pd
# Number of FASTA template files to read. Sequences found in
# `datasets/template_<k>.fa` are labelled with cluster id k-1 (0-based).
n_clusters = 200

# Maps the 1-based cluster number (as used in the file names) to the list of
# upper-cased sequences read from that file.
all_datasets = {}

# (sequence, 0-based cluster label) pairs, one per FASTA record. They are
# collected in a plain list and turned into a DataFrame once at the end:
# growing a DataFrame row by row with `df.loc[row, ...] = ...` reallocates on
# every insertion and becomes very slow for thousands of records.
records = []

for cluster in range(1, n_clusters + 1):
    # File path to your FASTA file
    path_to_file = f'datasets/template_{cluster}.fa'  # <--- substitute by your local path
    # The "with" statement guarantees the file is closed even if parsing fails.
    with open(path_to_file, mode='r') as handle:
        # SeqIO.parse yields the FASTA records one at a time, so the whole
        # file never has to be held in memory as raw text.
        sequences = [str(record.seq).upper() for record in SeqIO.parse(handle, 'fasta')]
    all_datasets[cluster] = sequences
    records.extend((sequence, cluster - 1) for sequence in sequences)

# Passing `columns` explicitly keeps both columns present even if no record
# was read at all.
df = pd.DataFrame(records, columns=['sequence', 'cluster'])

# Total number of sequences loaded; kept because the original cell left this
# counter in the notebook namespace.
row = len(df)

# Running the Model for both RF and DT

In [2]:
import nTreeClus
# Build the nTreeClus model over every loaded sequence. With method="All" the
# run log reports both a "Fit DT" and a "Fit RF" stage.
model = nTreeClus.nTreeClus(
    df["sequence"].tolist(),
    n=None,        # left unset; the run log shows 'n' being determined automatically
    ntree=5,
    method="All",
    verbose=1,     # progress messages are printed while fitting
    C=n_clusters,  # presumably the number of clusters to cut the tree into -- TODO confirm
)
# Run the clustering itself.
model.nTreeClus()

Matrix Segmentation (Splitting based on window size):   2%|▍                     | 288/14993 [00:00<00:06, 2255.51it/s]

Finding the parameter 'n'
Parameter 'n' is set to 18


Matrix Segmentation (Splitting based on window size): 100%|████████████████████| 14993/14993 [00:05<00:00, 2660.34it/s]


one-hot encoding + x/y train
Fit DT
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree


# Performance Report

In [4]:
# Ground-truth labels: the 0-based cluster ids stored in the `cluster` column.
labels = df["cluster"].tolist()
# Left as the last expression of the cell so the returned score table is displayed.
model.performance(Ground_Truth=labels)

Unnamed: 0,F1S,ARS,RS,Pur,Sil
DT,1.0,1.0,1.0,1.0,0.921
RF,1.0,1.0,1.0,1.0,0.93
