# ML classification
- load tokenizers
- tokenize data
- load data, split into train, test
- pick classifier (by running multiple tests)
- predict on test data (to determine the accuracy)
- pick the best vocabulary (tokenizer) to use


https://huggingface.co/transformers/preprocessing.html

## Imports and Paths

In [16]:
import os
import IPython

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tokenizers import CharBPETokenizer
from tokenizers import ByteLevelBPETokenizer

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn import metrics
from keras.preprocessing.sequence import pad_sequences

from yellowbrick.classifier import ROCAUC

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import uniform

from hpsklearn import HyperoptEstimator

RANDOM_STATE = 42

In [17]:
def _abs_join(*parts):
    """Join the given path components and return the normalized absolute path."""
    return os.path.abspath(os.path.join(*parts))


# Project root; every other location below is derived from it.
RootFolder = "/home/lieberze/DP/Thesis/"

# Input file that is fed to the tokenizers.
DataFolder = _abs_join(RootFolder, 'model_training/data/512_bp_for_encoding/NEW/')
FileToEncode_1M = _abs_join(DataFolder, 'All_1M.txt')
# FileToEncode_3M = _abs_join(DataFolder, 'All_3M.txt')

# Root of the tokenizer experiments and the folder holding the encoded output.
AttemptFolder = _abs_join(RootFolder, "tokenizery_new_data/")
EncodedFolder_1M = _abs_join(AttemptFolder, "data/sample/Encoding/Encoded_512bp_1M_lines/NEW/")

# Tokenizer folders: <AttemptFolder>/<name>/<FolderName>/All_512
FolderName = "All_genomes_sample"
name = "01_CharBPE"
CharBPE = _abs_join(AttemptFolder, f'{name}/{FolderName}')
All_512_BPE = _abs_join(CharBPE, 'All_512/')
name = "02_ByteLevelBPE"
ByteLevelBPE = _abs_join(AttemptFolder, f'{name}/{FolderName}')
All_512_BLBPE = _abs_join(ByteLevelBPE, 'All_512/')

## Tokenize Data

In [18]:
def LoadTokenizer(TokenizerPath, Tokenizer):
    """Instantiate a tokenizer from the files stored in ``TokenizerPath``.

    Parameters
    ----------
    TokenizerPath : str
        Folder that contains ``vocab.json`` and ``merges.txt``.
    Tokenizer : callable
        Tokenizer class (or any factory) called as ``Tokenizer(vocab, merges)``.

    Returns
    -------
    Whatever ``Tokenizer(vocab, merges)`` returns.
    """
    return Tokenizer(f"{TokenizerPath}/vocab.json", f"{TokenizerPath}/merges.txt")

def ModifyPath(TokenizerPath, TokenizerName, FolderForEncoded):
    """Build the output folder for the data encoded by one tokenizer.

    The last two components of ``TokenizerPath`` are appended to
    ``TokenizerName`` and the result is placed under ``FolderForEncoded``.

    Returns
    -------
    str
        Normalized absolute path
        ``<FolderForEncoded>/<TokenizerName>/<second-to-last>/<last>``.
    """
    tail_parts = TokenizerPath.strip("/").split("/")[-2:]
    relative = "/".join([TokenizerName, "/".join(tail_parts)])
    return os.path.abspath(os.path.join(FolderForEncoded, relative))

def EncodeFile(TokenizerPath, Tokenizer, FileToEncodePath, EncodedFilesPath):
    """Encode every sequence of a file and store the token ids next to its label.

    Each non-blank input line is split on whitespace; the first field is used
    as the sequence type and the last field as the sequence to encode. For
    every such line one tab-separated line ``<type> <list of ids>`` is written
    to ``<EncodedFilesPath>/encoded.txt``.

    Parameters
    ----------
    TokenizerPath : str
        Folder with the tokenizer files (passed on to ``LoadTokenizer``).
    Tokenizer : callable
        Tokenizer class passed on to ``LoadTokenizer``.
    FileToEncodePath : str
        Text file to read.
    EncodedFilesPath : str
        Output folder; created (with parents) when it does not exist.

    Returns
    -------
    None. Prints the output folder and the number of lines whose encoding
    contained an ``<unk>`` token.
    """
    # Pure-Python replacement for the former `!mkdir -p` shell escape: works
    # outside IPython and with paths containing spaces or shell metacharacters.
    os.makedirs(EncodedFilesPath, exist_ok=True)
    print(EncodedFilesPath)

    # Keep the class and the loaded instance under different names.
    tokenizer = LoadTokenizer(TokenizerPath, Tokenizer)

    unk = 0  # counts lines with at least one <unk>, not individual tokens
    with open(FileToEncodePath, "r") as file_in, \
            open(EncodedFilesPath + "/encoded.txt", "w") as file_out:
        for Line in file_in:
            LineSplit = Line.split()
            if not LineSplit:
                # Blank line (e.g. a trailing newline): nothing to encode.
                continue
            SeqType, Seq = LineSplit[0], LineSplit[-1]
            Encoded = tokenizer.encode(Seq)
            if "<unk>" in Encoded.tokens:
                unk += 1
            file_out.write(SeqType + "\t" + str(Encoded.ids) + "\n")
    print(f"There are {unk} unk tokens in this file")

The next cell is commented out because it only had to be run once.

Its output showed that there are no \<unk> tokens in any of the encoded files.

In [21]:
# VocabSizes = [5000, 15000, 50000]
# FileToEncode = FileToEncode_1M
# FolderForEncoded = EncodedFolder_1M

# # TokenizerName = "CharBPE"
# # Paths = [All_512_BPE]
# # Tokenizer = CharBPETokenizer
# # for Path in Paths:
# #     for Size in VocabSizes:
# #         TokenizerPath = f"{Path}/{Size}/"
# #         EncodedPath = ModifyPath(TokenizerPath, TokenizerName, FolderForEncoded)
# #         EncodeFile(TokenizerPath, Tokenizer, FileToEncode, EncodedPath)
        
# TokenizerName = "ByteLevelBPE"
# Paths = [All_512_BLBPE]
# Tokenizer = ByteLevelBPETokenizer
# for Path in Paths:
#     for Size in VocabSizes:
#         TokenizerPath = f"{Path}/{Size}/"
#         EncodedPath = ModifyPath(TokenizerPath, TokenizerName, FolderForEncoded)
#         EncodeFile(TokenizerPath, Tokenizer, FileToEncode, EncodedPath)

Show paths of all the files

In [20]:
# One encoded folder per (tokenizer, sequence length, vocabulary size) combination.
Names = ["CharBPE", "ByteLevelBPE"]
BasePairLengths = ["All_512"]
VocabSizes = [5000, 15000, 50000]

# Order matters: later cells slice this list (all CharBPE folders come first).
Paths = [
    os.path.abspath(
        os.path.join(EncodedFolder_1M, Name + "/" + BasePairLength + "/" + str(Size) + "/")
    )
    for Name in Names
    for BasePairLength in BasePairLengths
    for Size in VocabSizes
]

for EncodedPath in Paths:
    print(EncodedPath)

/home/lieberze/DP/Thesis/tokenizery_new_data/data/sample/Encoding/Encoded_512bp_1M_lines/NEW/CharBPE/All_512/5000
/home/lieberze/DP/Thesis/tokenizery_new_data/data/sample/Encoding/Encoded_512bp_1M_lines/NEW/CharBPE/All_512/15000
/home/lieberze/DP/Thesis/tokenizery_new_data/data/sample/Encoding/Encoded_512bp_1M_lines/NEW/CharBPE/All_512/50000
/home/lieberze/DP/Thesis/tokenizery_new_data/data/sample/Encoding/Encoded_512bp_1M_lines/NEW/ByteLevelBPE/All_512/5000
/home/lieberze/DP/Thesis/tokenizery_new_data/data/sample/Encoding/Encoded_512bp_1M_lines/NEW/ByteLevelBPE/All_512/15000
/home/lieberze/DP/Thesis/tokenizery_new_data/data/sample/Encoding/Encoded_512bp_1M_lines/NEW/ByteLevelBPE/All_512/50000


## Pick best model and hyperparameters

In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier

# Candidate classifiers and, for each of them, the hyperparameter values the
# randomized search may sample from.

# https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d
random_forest = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)
random_forest_space = dict(
    n_estimators=[150, 200, 400],
    max_depth=[1, 3, 5, 7, 9],
    criterion=["gini", "entropy"],
    min_samples_split=[0.6, 0.8, 2, 4],
    min_samples_leaf=[0.4, 1, 2, 4, 10, 15],
)

# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
gradient_boosting = GradientBoostingClassifier(
    max_features='sqrt', subsample=0.8, random_state=RANDOM_STATE
)
gradient_boosting_space = dict(
    min_samples_split=[3000, 4000, 5000, 6000],
    min_samples_leaf=[50, 100, 150, 200],
    max_depth=[5, 6, 7, 8],
)

ridge = RidgeClassifier(random_state=RANDOM_STATE)
ridge_space = dict(alpha=[1e-3, 1e-2, 1e-1, 1])

# Maps each unfitted estimator to its search space; insertion order is the
# order in which the classifiers are tried.
model_grid = {
    random_forest: random_forest_space,
    gradient_boosting: gradient_boosting_space,
    ridge: ridge_space,
}

# Classifiers that were tried and dropped:
#
# KNN - memory problems.
# It is advised to use the KNN algorithm for multiclass classification
# if the number of samples of the data is less than 50,000.
#     KNeighborsClassifier(n_jobs=-1): {
#         "n_neighbors": [3, 5, 10, 20, 30],
#         "leaf_size": [5, 10, 20, 30, 40],
#         "metric": []
#     }
#
# Kernel ridge - https://www.mdelcueto.com/blog/kernel-ridge-regression-tutorial/
# TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated.
# This could be caused by a segmentation fault while calling the function or by an excessive
# memory usage causing the Operating System to kill the worker.
#     KernelRidge(): {
#         "kernel": ['poly', 'rbf', 'linear'],
#         "degree": [2, 3, 4, 5],
#         "alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1]
#     }

In [23]:
# for path in paths   
def LoadData(Path):
    """Load an encoded file into a padded feature matrix and a label vector.

    Every non-blank line must consist of a label and a bracketed,
    comma-separated list of token ids, separated by a single tab
    (the format written by ``EncodeFile``).

    Parameters
    ----------
    Path : str
        Path of the encoded text file.

    Returns
    -------
    X : 2-D integer array, one row per sequence, right-padded with 1 up to the
        length of the longest sequence.
    y : 1-D array with the label of every row.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly two tab-separated fields or
        an id is not an integer.
    """
    SeqTypes, Ids = [], []
    with open(Path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. at the end of the file).
                continue
            seq_type, raw_ids = line.split("\t")
            # Parse the ids to real integers instead of relying on an implicit
            # str -> int cast later on; an empty list "[]" yields no ids.
            fields = raw_ids.strip("[]").split(",")
            Ids.append([int(field) for field in fields if field.strip()])
            SeqTypes.append(seq_type)

    # Pad at the end ('post') with 1; per the original note this is the id of
    # the <pad> token for both tokenizers. Flat id lists give a 2-D result
    # directly, so no reshape is needed afterwards.
    X = pad_sequences(Ids, value=1, padding='post')
    y = np.array(SeqTypes)
    return X, y

# https://stackoverflow.com/questions/35388647/how-to-use-gridsearchcv-output-for-a-scikit-prediction
# https://stackoverflow.com/questions/64950438/how-does-randomizedsearchcv-decide-what-the-best-parameters-are
# RS-CV uses accuracy for classification
# https://www.baeldung.com/cs/multi-class-f1-score
# The class F-1 scores are averaged by using the number of instances in a class as weights
def TryClassifiers(model_grid, random_state, X_train, y_train):
    """Run a randomized hyperparameter search for every candidate classifier.

    Parameters
    ----------
    model_grid : dict
        Maps an estimator to the parameter values to sample from.
    random_state : int
        Seed of the randomized search. The K-fold shuffle uses a fixed seed
        of 0, independent of this value.
    X_train, y_train : array-like
        Data the searches are fitted on.

    Returns
    -------
    list
        One ``[best cross-validated weighted F1, estimator from model_grid,
        full parameter dict of the best estimator]`` entry per classifier,
        in the iteration order of ``model_grid``.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=0)

    results = []
    for estimator, search_space in model_grid.items():
        search = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=search_space,
            n_iter=5,
            cv=folds,
            scoring="f1_weighted",
            random_state=random_state,
            n_jobs=-1,
            verbose=1,
        )
        search.fit(X_train, y_train)
        results.append(
            [search.best_score_, estimator, search.best_estimator_.get_params()]
        )
    return results

def PickBest(BestScores, X_train=None, y_train=None):
    """Select the best-scoring classifier, configure it and fit it.

    Parameters
    ----------
    BestScores : list
        ``[validation_score, estimator, params]`` entries as returned by
        ``TryClassifiers``.
    X_train, y_train : array-like, optional
        Data the winning estimator is fitted on. When omitted, the
        module-level variables ``X_train`` / ``y_train`` are used, which is
        what this function always did before these parameters existed. Pass
        them explicitly to avoid depending on notebook state (for instance to
        fit on the same subsample the search was run on).

    Returns
    -------
    The winning estimator object from ``BestScores`` after ``set_params`` and
    ``fit``. The object is modified in place, not copied.

    Raises
    ------
    ValueError
        If ``BestScores`` is empty.
    """
    if not BestScores:
        raise ValueError("BestScores is empty: no classifier was evaluated")

    # Backward-compatible fallback to the notebook-level training data.
    if X_train is None:
        X_train = globals()["X_train"]
    if y_train is None:
        y_train = globals()["y_train"]

    # max() returns the first entry with the highest score, the same entry the
    # previous descending stable sort put first.
    validation_score, model, params = max(BestScores, key=lambda entry: entry[0])

    # Load the best hyperparameters into the estimator before the final fit.
    model.set_params(**params)
    print(model)
    print("validation f1_score (weighted):", validation_score)
    return model.fit(X_train, y_train)
      
def FitPredictGetMetrics(model, X_test, y_test, PathToSave):
    """Evaluate an already fitted classifier on the test data and save the results.

    Despite its name the function does not fit anything; it only predicts.

    Side effects: prints the weighted F1 score and the classification report,
    writes ``report.pkl`` and ``confusion_matrix.png`` into ``PathToSave`` and
    shows the confusion-matrix heatmap.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    X_test, y_test : array-like
        Test features and true labels.
    PathToSave : str
        Existing folder the report and the figure are written to.

    Returns
    -------
    float
        The ``weighted avg`` F1 score taken from the classification report.
    """
    y_pred = model.predict(X_test)

    # Sorted labels that actually occur in the data. Passing them explicitly
    # keeps matrix rows/columns and tick labels in sync, where the previous
    # hard-coded three names failed for any other class set.
    class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

    # metrics
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = metrics.confusion_matrix(y_test, y_pred, labels=class_labels, normalize='true')
    report = metrics.classification_report(
        y_test, y_pred, digits=2, output_dict=True, zero_division=0
    )

    print("test f1_score (weighted):", f1)

    # report table (f1, precision, recall), one row per class / average
    df_report = pd.DataFrame(report).transpose()
    df_report.to_pickle(PathToSave + "/report.pkl")
    weighted_F1_average = df_report.loc["weighted avg", "f1-score"]
    print(df_report)

    # confusion matrix, each row normalized over the true class
    df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title('Confusion Matrix, normalized', size=16)
    sns.heatmap(df_cm, annot=True, cmap='Blues', ax=ax)
    fig.savefig(PathToSave + '/confusion_matrix.png', transparent=False, dpi=80, bbox_inches="tight")
    plt.show()
    # The function is called once per tokenizer; do not keep figures alive.
    plt.close(fig)

    return weighted_F1_average

# https://www.scikit-yb.org/en/latest/api/classifier/index.html    
def ROCAUCcurve(model, X_train, y_train, X_test, y_test, PathToSave):
    """Draw the ROC/AUC curves of ``model``, save them and show them.

    Parameters
    ----------
    model : estimator wrapped by the yellowbrick ``ROCAUC`` visualizer.
    X_train, y_train : array-like
        Data passed to ``visualizer.fit``.
        NOTE(review): depending on the yellowbrick version an already fitted
        model may or may not be refitted here - confirm.
    X_test, y_test : array-like
        Data the curves are computed on.
    PathToSave : str
        Existing folder; the figure is written to ``ROC_curve.png`` inside it.
    """
    visualizer = ROCAUC(model, classes=["exon", "intergenic", "intron"])
    visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)        # Evaluate the model on the test data

    # finalize() draws the yellowbrick-specific elements (title, legend, axis
    # labels). It used to run only inside show(), i.e. after the file had
    # already been written, so the saved image was missing them.
    visualizer.finalize()
    plt.savefig(PathToSave + '/ROC_curve.png', transparent=False, dpi=80, bbox_inches="tight")
    visualizer.show()
    
#     # Call finalize to draw the final yellowbrick-specific elements
#     model.finalize()

#     # Get access to the axes object and modify labels
#     model.ax.set_xlabel("measured concrete strength")
#     model.ax.set_ylabel("predicted concrete strength")
#     plt.savefig("peplot.pdf")

    # from yellowbrick.classifier import PrecisionRecallCurve
    # visualizer = PrecisionRecallCurve(model, classes=["exon", "intergenic", "intron"])
    # visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
    # visualizer.score(X_test, y_test)        # Evaluate the model on the test data
    # visualizer.show()

In [8]:
# # https://inblog.in/AUC-ROC-score-and-curve-in-multiclass-classification-problems-2ja4jOHb2X

# #importing all the necessary libraries
# from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# from sklearn.metrics import roc_curve, roc_auc_score

# # model.fit(X_tr, y_tr)

# # predicting the data
# y_pred_cnb = model.predict(X_te)
# y_prob_pred_cnb = model.predict_proba(X_te)

# #roc auc score
# roc_auc_score(y_te, y_prob_pred_cnb, multi_class='ovo', average='weighted')

In [9]:
# # POSSIBLE METRICS FOR RANDOMIZEDSEARCHCV
# import sklearn.metrics
# sklearn.metrics.SCORERS.keys()

Note: the score coming out of cross-validation is really a validation error (Vašata).

The dataset is balanced (mention this).

Report ROC curves, recall, precision, F1 and a 3x3 confusion matrix.

Train a separate model for each file.

In [24]:
import joblib

In [11]:
# train_size = 150
# test_size = 50
# measures = []
# for Path in Paths:
#     print(Path)
#     # X, y = LoadData(Path + "/encoded.txt")
# #     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = RANDOM_STATE)
    
# #     #smaller_sample
# #     X_tr, y_tr =  X_train[:train_size], y_train[:train_size]
# #     X_te, y_te = X_test[:test_size], y_test[:test_size]

#     # BestScores = TryClassifiers(model_grid, RANDOM_STATE, X_tr, y_tr)
#     model = PickBest(BestScores)
#     # save model
#     filename = '/classification_model.sav'
#     joblib.dump(model, Path + filename)
    
#     exon_vs_rest_f1_score_ = FitPredictGetMetrics(model, X_te, y_te)
#     measures.append(exon_vs_rest_f1_score_)
#     # ROCAUCcurve(model, X_tr, y_tr, X_te, y_te)
        
#     break

In [12]:
# measures

## Load any model

In [13]:
# filename = Path + '/classification_model.sav'

# # load the model from disk
# loaded_model = joblib.load(filename)
# # result = loaded_model.score(X_te, y_te)
# print(loaded_model)

## Results for 150k lines. With ROC curves

In [26]:
# Paths[3:]

In [None]:
# Train, select and evaluate one classifier per encoded file.
# Sizes of the subsamples taken from the train / test split below.
train_size = 150000
test_size = 50000
# Weighted-average F1 on the test subsample, one entry per processed path.
measures = []
# Paths[3:] skips the first three entries, i.e. the CharBPE folders (see the
# path listing printed above); only the ByteLevelBPE encodings are processed.
for Path in Paths[3:]:
    print(Path)
    X, y = LoadData(Path + "/encoded.txt")
    # 60 % train / 40 % test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = RANDOM_STATE)
    
    # Smaller sample: the first train_size / test_size rows of each split.
    X_tr, y_tr =  X_train[:train_size], y_train[:train_size]
    X_te, y_te = X_test[:test_size], y_test[:test_size]

    # The hyperparameter search runs on the subsample (X_tr, y_tr) ...
    BestScores = TryClassifiers(model_grid, RANDOM_STATE, X_tr, y_tr)
    # ... but PickBest takes no data arguments: it fits the winner on the
    # notebook-level X_train / y_train assigned above, i.e. the full training
    # split rather than the subsample. Keep these variable names unchanged.
    model = PickBest(BestScores)
    
    # Save the fitted model next to the encoded data.
    filename = '/classification_model.sav'
    joblib.dump(model, Path + filename)
    
    # Writes report.pkl and confusion_matrix.png into Path.
    weighted_f1_average_ = FitPredictGetMetrics(model, X_te, y_te, Path)
    measures.append(weighted_f1_average_)
    
    # Writes ROC_curve.png into Path.
    ROCAUCcurve(model, X_tr, y_tr, X_te, y_te, Path)

/home/lieberze/DP/Thesis/tokenizery_new_data/data/sample/Encoding/Encoded_512bp_1M_lines/NEW/ByteLevelBPE/All_512/5000
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits




GradientBoostingClassifier(max_depth=8, max_features='sqrt',
                           min_samples_leaf=100, min_samples_split=3000,
                           random_state=42, subsample=0.8)
validation f1_score (weighted): 0.41271801036612565


## Continue the computation (it stopped again)

## Load saved model
classification_model.sav

In [None]:
# for Path in Paths:
#     print(Path + "/classification_model.sav")

In [None]:
# train_size = 150000
# test_size = 50000
# measures = []
# for Path in Paths[1:]:
#     print(Path)
#     X, y = LoadData(Path + "/encoded.txt")
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = RANDOM_STATE)
    
#     #smaller_sample
#     X_tr, y_tr =  X_train[:train_size], y_train[:train_size]
#     X_te, y_te = X_test[:test_size], y_test[:test_size]
    
#     # load model
#     model = joblib.load(Path + "/classification_model.sav")
#     print(model)
    
#     exon_vs_rest_f1_score_ = FitPredictGetMetrics(model, X_te, y_te, Path)
#     measures.append(exon_vs_rest_f1_score_)
    
#     ROCAUCcurve(model, X_tr, y_tr, X_te, y_te, Path)

## Pick the best score (tokenizer)

In [None]:
# # how to evaluate? look at the ratio (measures[2])?
# for p, m in zip(Paths, measures):
#     print(p)
#     print(m)

In [None]:
# new_measures = []
# for p, m in zip(Paths, measures):
#     X = "-".join(p.split("/")[-3:])
#     new_measures.append([X, m])
# new_measures

In [None]:
# measures
# zprumerovat do diplomky(?)
# vzit neco z 512 bp

In [None]:
# sorted_by_third = sorted(new_measures, key=lambda tup: tup[1][2], reverse=True)
# sorted_by_third

## Show saved results

In [None]:
import pickle as pkl
from PIL import Image

In [None]:
# Reload the report and the figures saved for every encoded folder and collect
# the weighted-average F1 score of each one in `avgs` as [path, score] pairs.
avgs = []
for Path in Paths:
    print(Path)
    # The report was written with DataFrame.to_pickle, so read it back with
    # the matching pandas reader. The former `pickle.load(f)` raised a
    # NameError because the module is only imported under the alias `pkl`.
    report = pd.read_pickle(Path + '/report.pkl')

    w_a = report["f1-score"]["weighted avg"]
    print(report)
    avgs.append([Path, w_a])

    conf_m = Image.open(Path + '/confusion_matrix.png')
    roc = Image.open(Path + '/ROC_curve.png')

    conf_m.show()
    roc.show()

In [None]:
avgs