# Reading the Dataset

In [1]:
import pandas as pd

# Prerequisite: first unzip the dataset genecode.v39.csv.gz in the same folder
# as this notebook.
# NOTE(review): the archive is named genecode.v39.csv.gz but this cell reads
# "genecode.csv" -- confirm the extracted file name matches DATASET_PATH.
DATASET_PATH      = "genecode.csv"
N_TOP_GENES       = 10    # how many of the most frequent gene-ids to keep
csv_available     = True  # not read by any cell visible here; kept for later cells

# Load the raw table and keep an untouched copy in `df`.
df                = pd.read_csv(DATASET_PATH)
df_copy           = df.copy()

# gene-ids ordered by how many transcripts (rows) they have, most frequent first.
most_repeated_seq = df_copy['gene-id'].value_counts().head(N_TOP_GENES).index.tolist()

# Keep only the rows of those genes. Boolean indexing plus reset_index returns
# a new frame, so no in-place mutation is needed and re-running is idempotent.
filtered_df       = df_copy[df_copy['gene-id'].isin(most_repeated_seq)].reset_index(drop=True)
filtered_df.head(2)

Unnamed: 0,transcript-id,gene-id,"Havana-gene-id (if the gene contains manually annotated transcripts, '-' otherwise)","Havana-transcript-id (if this transcript was manually annotated, '-' otherwise)",transcript-name,gene-name,sequence-length,transcript biotype,sequence
0,ENST00000419542.5,ENSG00000179818.16,OTTHUMG00000153728.26,OTTHUMT00000329417.2,PCBP1-AS1-214,PCBP1-AS1,4041,lncRNA,CGCCATGTTTCCTGAACACAAAATGGCGACACGTGGTTAGCATTCG...
1,ENST00000682294.1,ENSG00000179818.16,OTTHUMG00000153728.26,-,PCBP1-AS1-440,PCBP1-AS1,4755,lncRNA,TTTTTGCCGGTTCTTCCCGCTGAGGAAGTGTCGCTCAAGACGCTGG...


# Fitting nTreeClus

In [2]:
import nTreeClus
from tqdm import tqdm
def convert_label_to_values(column):
    """Encode the labels of a pandas Series as consecutive integers.

    Each distinct value of ``column`` is assigned an integer code, starting
    at 0, in the order in which the values are returned by ``column.unique()``.

    Parameters
    ----------
    column : pandas.Series
        Series of labels to encode.

    Returns
    -------
    pandas.Series
        ``column`` mapped through the label -> code dictionary; the index of
        the input is preserved.
    """
    code_of = {label: code for code, label in enumerate(column.unique())}
    return column.map(code_of)
# Ground truth: every distinct gene-id is treated as one cluster.
gene_ids   = filtered_df['gene-id']
n_clusters = gene_ids.nunique(dropna=False)
labels     = list(convert_label_to_values(gene_ids))

# Build the nTreeClus model on the raw sequences, asking for as many clusters
# as there are distinct gene-ids. n=None leaves the parameter 'n' to nTreeClus
# (the log below reports "Finding the parameter 'n'").
sequences  = list(filtered_df['sequence'])
model      = nTreeClus.nTreeClus(
    sequences,
    n=None,
    ntree=10,
    method="All",
    verbose=1,
    C=n_clusters,
)
model.nTreeClus()

# Print the performance table computed against the gene-id labels.
performance = model.performance(Ground_Truth=labels)
print(performance, f'\n ############ \n\n')

Matrix Segmentation (Splitting based on window size):   3%|▎         | 60/1728 [00:00<00:02, 564.22it/s]

Finding the parameter 'n'
Parameter 'n' is set to 42


Matrix Segmentation (Splitting based on window size): 100%|██████████| 1728/1728 [00:04<00:00, 359.00it/s]


one-hot encoding + x/y train
Fit DT
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
Fit DT + POSITION
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
Fit RF + POSITION
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
        F1S    ARS     RS    Pur    Sil    1NN
DT    0.859  0.690  0.936  0.852  0.491  0.999
RF    0.860  0.696  0.938  0.855  0.498  0.999
DT_p  0.872  0.724  0.945  0.867  0.491  0.999
RF_p  0.870  0.718  0.943  0.865  0.491  0.999 
 ############ 




# Execution Time (Sec.)

In [3]:
# Report the running_time* attributes recorded on the fitted model, one per line.
timing_report = (
    "\n"
    f"      running_time Matrix Segmentation: {model.running_timeSegmentation},\n"
    f"      running_time DT: {model.running_timeDT},\n"
    f"      running_time DT_p: {model.running_timeDT_p},\n"
    f"      running_time RF: {model.running_timeRF},\n"
    f"      running_time RF_p: {model.running_timeRF_p}.\n"
    "      "
)
print(timing_report)


      running_time Matrix Segmentation: 18,
      running_time DT: 96,
      running_time DT_p: 100,
      running_time RF: 396,
      running_time RF_p: 397.
      


In [4]:
import platform,json,psutil,logging,cpuinfo

def getSystemInfo():
    """Collect basic information about the host and return it as a JSON string.

    The JSON object contains, in this order, the keys 'platform',
    'platform-release', 'platform-version', 'architecture', 'processor'
    and 'ram'. 'ram' is ``psutil.virtual_memory().total`` divided by
    1024**3, rounded to an integer and formatted as "<n> GB".

    Each value is probed independently. If a probe raises, the failure is
    logged and that key is set to ``None`` (JSON ``null``) instead of
    aborting the whole report. The function therefore always returns a
    string that ``json.loads`` can parse; it never falls through and
    returns ``None``.

    Returns
    -------
    str
        JSON-encoded dictionary of the collected values.
    """
    # Lazily evaluated probes, so a failure in one cannot discard the others.
    probes = {
        'platform': lambda: platform.system(),
        'platform-release': lambda: platform.release(),
        'platform-version': lambda: platform.version(),
        'architecture': lambda: platform.machine(),
        'processor': lambda: f"Processor: {cpuinfo.get_cpu_info()['brand_raw']}",
        'ram': lambda: str(round(psutil.virtual_memory().total / (1024.0 **3)))+" GB",
    }

    info = {}
    for key, probe in probes.items():
        try:
            info[key] = probe()
        except Exception:
            # Best effort: keep the key so the schema stays stable, log the cause.
            logging.exception("Could not determine system info field %r", key)
            info[key] = None
    return json.dumps(info)

# Decode the JSON report back into a dict so the notebook renders it.
system_info_json = getSystemInfo()
json.loads(system_info_json)

{'platform': 'Linux',
 'platform-release': '4.9.0-11-amd64',
 'platform-version': '#1 SMP Debian 4.9.189-3 (2019-09-02)',
 'architecture': 'x86_64',
 'processor': 'Processor: Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz',
 'ram': '126 GB'}