# Reading Files

In [1]:
from Bio import SeqIO
import pandas as pd
# Number of template files to read; each file holds the sequences of one ground-truth cluster.
n_clusters   = 200
# Maps the 1-based cluster number (as used in the file names) to its list of sequences.
all_datasets = {}
# One {'sequence', 'cluster'} record per FASTA entry. The DataFrame is built from
# this list once, after the loop, instead of being enlarged cell-by-cell with
# df.loc (which re-allocates on every new row and leaves 'cluster' as object dtype).
records      = []
for cluster in range(1, n_clusters + 1):
    # File path to your FASTA file
    path_to_file = f'datasets/template_{cluster}.fa' # <--- substitute by your local path
    # Open the file in a "with" statement so the handle is always closed,
    # even if parsing raises part-way through.
    with open(path_to_file, mode='r') as handle:
        # Biopython's parse function processes the FASTA records one at a time;
        # only the upper-cased sequence string of each record is kept.
        sequences = [str(record.seq).upper() for record in SeqIO.parse(handle, 'fasta')]
    all_datasets[cluster] = sequences
    # Labels are stored 0-based (cluster - 1), unlike the 1-based file names.
    records.extend({'sequence': sequence, 'cluster': cluster - 1} for sequence in sequences)
# Passing `columns` keeps the expected schema even if no record was read.
df = pd.DataFrame(records, columns=['sequence', 'cluster'])
df.head()

Unnamed: 0,sequence,cluster
0,TCTATCTGGGGAACACTACTCCCTGAACCGAGCGGTCAGATATTTG...,0
1,TCTATCTGGGGAACACTACTGCCCTGAACCAGAGCGGCAGATATTT...,0
2,TCTATCTGGGGAACATTACTCCCGTGAACCGAGCGGTCAGTATTTG...,0
3,TCTTCTGGGGAACACTACCTCCCTGAACCGAGCCGGTCAGATATTT...,0
4,TCTATCTGGGGAACACTACTCCCTGAACCGAGCGGTCAGATATTTG...,0


# Running the Model for both RF and DT

In [2]:
import nTreeClus
# Build the clusterer over every loaded sequence. method="All" makes the run
# fit the DT, DT + position, RF and RF + position variants (see the log below);
# n=None leaves the parameter 'n' to be found by the library, and C is the
# number of clusters requested (one per template file).
model = nTreeClus.nTreeClus(
    list(df["sequence"]),
    n=None,
    ntree=5,
    method="All",
    verbose=1,
    C=n_clusters,
)
# Run the clustering; progress is reported on stdout because verbose=1.
model.nTreeClus()

Matrix Segmentation (Splitting based on window size):   6%|▌         | 873/14993 [00:00<00:03, 4435.79it/s]

Finding the parameter 'n'
Parameter 'n' is set to 18


Matrix Segmentation (Splitting based on window size): 100%|██████████| 14993/14993 [00:04<00:00, 3588.38it/s]


one-hot encoding + x/y train
Fit DT
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
Fit DT + POSITION
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
Fit RF + POSITION
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree


# Performance Report

In [3]:
# Ground-truth cluster label of every sequence, in the same row order that
# was handed to the model.
labels = df["cluster"].tolist()
# Score each fitted variant against the known labels; the returned table is
# rendered as the cell output.
model.performance(Ground_Truth=labels)

Unnamed: 0,F1S,ARS,RS,Pur,Sil,1NN
DT,1.0,1.0,1.0,1.0,0.921,1.0
RF,1.0,1.0,1.0,1.0,0.93,1.0
DT_p,1.0,1.0,1.0,1.0,0.815,1.0
RF_p,1.0,1.0,1.0,1.0,0.871,1.0


# Execution Time (sec.)

In [4]:
# Gather the per-stage running times recorded on the fitted model into one
# report string, then print it. The labels state the values are in seconds.
timing_report = f"""
      running_time Matrix Segmentation: {model.running_timeSegmentation} seconds,
      running_time DT: {model.running_timeDT} seconds,
      running_time DT_p: {model.running_timeDT_p} seconds,
      running_time RF: {model.running_timeRF} seconds,
      running_time RF_p: {model.running_timeRF_p} seconds.
      """
print(timing_report)


      running_time Matrix Segmentation: 9 seconds,
      running_time DT: 69 seconds,
      running_time DT_p: 74 seconds,
      running_time RF: 136 seconds,
      running_time RF_p: 143 seconds.
      


In [5]:
import platform,json,psutil,logging,cpuinfo

def getSystemInfo():
    """Describe the machine the notebook ran on, as a JSON string.

    The report has the keys 'platform', 'platform-release',
    'platform-version', 'architecture', 'processor' and 'ram'.

    Each value is probed independently. If a probe fails (for example because
    an optional package is unavailable), the exception is logged and that
    field is reported as "unknown". The remaining fields are still returned,
    so callers can always pass the result to ``json.loads``.

    Returns:
        str: JSON object with the six fields listed above.
    """
    def _probe(read):
        # Best-effort read of one field: log the failure and fall back to a
        # placeholder instead of aborting the whole report.
        try:
            return read()
        except Exception as e:
            logging.exception(e)
            return "unknown"

    info = {
        'platform': _probe(lambda: platform.system()),
        'platform-release': _probe(lambda: platform.release()),
        'platform-version': _probe(lambda: platform.version()),
        'architecture': _probe(lambda: platform.machine()),
        'processor': _probe(lambda: f"Processor: {cpuinfo.get_cpu_info()['brand_raw']}"),
        # Total physical memory, converted from bytes to whole GiB.
        'ram': _probe(lambda: str(round(psutil.virtual_memory().total / (1024.0 **3)))+" GB"),
    }
    return json.dumps(info)

# Decode the JSON report back into a dict so the notebook renders it as the
# cell output.
system_info = json.loads(getSystemInfo())
system_info

{'platform': 'Linux',
 'platform-release': '4.9.0-11-amd64',
 'platform-version': '#1 SMP Debian 4.9.189-3 (2019-09-02)',
 'architecture': 'x86_64',
 'processor': 'Processor: Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz',
 'ram': '126 GB'}