# Gapped k-mer Support Vector Machine (gkSVM) for Enhancer Activity Prediction

**Authorship:**
Adam Klie, *08/02/2021*
***
**Description:**
    Notebook to train gkSVM classifiers for predicting enhancer activity based on the implementation in https://github.com/Dongwon-Lee/lsgkm/.

 - Currently allows the user to create a custom script with gkSVM hyperparameters
***
**TODOs:**
 - <font color='red'> Figure out how to work with kwargs in future </font>
***

In [2]:
# Classics
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Load local libraries
import sys
sys.path.append('/cellar/users/aklie/projects/EUGENE/bin/')
import project_utils

# Set-up

In [216]:
# Data params
# These constants are interpolated into every input path, file name and the
# output directory name built in the cells below.
DATASET = "2021_OLS_Library"  # Which dataset to look at (names the sub-directory under the project's data folder)
FEATURES = "fasta"  # What features to use to train the model (appears in input file names and in OUTDIR)
LABELS = "binary"  # Label type; names the sub-directory and suffix of the y-test labels file
PREPROCESS = "0.09-0.4"  # Preprocessing steps, separated by "-" (prefix of every input/output file name)
SPLIT = 0.9  # Value written into the train file names; the test file names use round(1 - SPLIT, 1)
SUBSET = False  # NOTE(review): not referenced by any other cell visible in this notebook

In [229]:
# gkSVM hyperparams, stored as strings and embedded in HYPERPARAM below.
# See the lsgkm documentation for what each means.
KERNEL = "2"
WORD_LENGTH = "11"
INFORM_COLS = "7"
MISMATCHES = "3"
REG_PARAM = "1.0"
POS_WEIGHT = "1.345"
REV_COMP = True

# Dash-joined tag. Field order: kernel, word length, informative columns,
# mismatches, reverse-complement flag, regularization, positive weight.
hyperparam_fields = (KERNEL, WORD_LENGTH, INFORM_COLS, MISMATCHES,
                     REV_COMP, REG_PARAM, POS_WEIGHT)
HYPERPARAM = "{}-{}-{}-{}-{}-{}-{}".format(*hyperparam_fields)
ARCHITECTURE = "gkSVM"

In [230]:
# Output directory: one folder per preprocess / features / hyperparams / architecture combination
OUTDIR = "{}_{}_{}_{}".format(PREPROCESS, FEATURES, HYPERPARAM, ARCHITECTURE)
if os.path.exists(OUTDIR):
    # Nothing to create; results from an earlier run may already live here
    print("Directory already exists")
else:
    print("Making directory {}".format(OUTDIR))
    os.makedirs(OUTDIR)

Directory already exists


In [231]:
# Locate the FASTA inputs. Training sequences come as one file per class
# (positive / negative); the held-out file is tagged with the complementary split.
in_dir = "/cellar/users/aklie/projects/EUGENE/data/{0}/fasta".format(DATASET)
train_name_fields = (PREPROCESS, SPLIT, FEATURES)
pos = "{0}_X-train-{1}_{2}-pos.fa".format(*train_name_fields)
neg = "{0}_X-train-{1}_{2}-neg.fa".format(*train_name_fields)
val = "{0}_X-test-{1}_{2}.fa".format(PREPROCESS, round(1-SPLIT, 1), FEATURES)
(in_dir, pos, neg, val)

('/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/fasta',
 '0.09-0.4_X-train-0.9_fasta-pos.fa',
 '0.09-0.4_X-train-0.9_fasta-neg.fa',
 '0.09-0.4_X-test-0.1_fasta.fa')

# Train gkSVM model with script

In [232]:
# Generate the SLURM training script for this configuration. The helper prints the
# path of the generated .sh file and a "Usage: sbatch ..." line for submitting it.
project_utils.generate_slurm_train_script(in_dir, pos, neg, val, OUTDIR, HYPERPARAM, PREPROCESS)

Successfully generated 0.09-0.4_fasta_2-11-7-3-True-1.0-1.345_gkSVM/train_0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.sh
Usage: sbatch train_0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.sh --job-name=train_0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345 -o 0.09-0.4_fasta_2-11-7-3-True-1.0-1.345_gkSVM/train_0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.out -e 0.09-0.4_fasta_2-11-7-3-True-1.0-1.345_gkSVM/train_0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.err --mem=20G


In [233]:
#!sbatch train_0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.sh --job-name=train_0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345 --mem=20G

# Validation Set performance

In [234]:
# Prediction files share the model prefix inside OUTDIR and differ only by suffix
model = "{}_{}_{}-clf_{}".format(PREPROCESS, FEATURES, ARCHITECTURE, HYPERPARAM)
model_name = os.path.join(OUTDIR, model)
pos_tr_file, neg_tr_file, val_file = (
    name_template.format(model_name)
    for name_template in ("{}.train-pos.predict.txt",
                          "{}.train-neg.predict.txt",
                          "{}.test.predict.txt")
)
(pos_tr_file, neg_tr_file, val_file)

('0.09-0.4_fasta_2-11-7-3-True-1.0-1.345_gkSVM/0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.train-pos.predict.txt',
 '0.09-0.4_fasta_2-11-7-3-True-1.0-1.345_gkSVM/0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.train-neg.predict.txt',
 '0.09-0.4_fasta_2-11-7-3-True-1.0-1.345_gkSVM/0.09-0.4_fasta_gkSVM-clf_2-11-7-3-True-1.0-1.345.test.predict.txt')

## **Training scores**

In [235]:
# Read the training-set scores, positives first and then negatives.
# Make sure ordering is correct: labels are later assigned purely by position.
pos_tr_scores, neg_tr_scores = (
    project_utils.get_scores(score_file)
    for score_file in (pos_tr_file, neg_tr_file)
)

In [236]:
# Stack positive scores followed by negative scores and build matching labels.
# Assumes get_scores returns Python lists, so `+` concatenates - TODO confirm.
n_pos, n_neg = len(pos_tr_scores), len(neg_tr_scores)
y_tr_probs = np.array(pos_tr_scores + neg_tr_scores)
y_train = np.array([1]*n_pos + [0]*n_neg)
# Scores are thresholded at 0: non-negative -> class 1, negative -> class 0
y_tr_preds = np.greater_equal(y_tr_probs, 0).astype(int)
print(y_train.shape, y_tr_probs.shape, y_tr_preds.shape)

(177793,) (177793,) (177793,)


## **Test scores**

In [237]:
# Read the held-out scores. Make sure ordering is correct: these are compared
# position-by-position against y_test.
val_scores = project_utils.get_scores(val_file)
y_probs = np.array(val_scores)

In [238]:
# Ground-truth labels for the held-out split, read as integers from a text file
y_test_path = '../data/{0}/{1}/{2}_y-test-{3}_{1}.txt'.format(DATASET, LABELS, PREPROCESS, round(1-SPLIT, 1))
y_test = np.loadtxt(y_test_path, dtype=int)
#y_test = np.random.randint(0, 2, size=len(val_scores))

In [239]:
# Same decision rule as for the training scores: non-negative score -> class 1
y_preds = np.greater_equal(y_probs, 0).astype(int)
print(y_test.shape, y_probs.shape, y_preds.shape)

(19755,) (19755,) (19755,)


In [240]:
# Predictions and scores were computed above, so no feature matrices are passed
# (train_X / test_X stay None); the report is written under OUTDIR.
report_kwargs = dict(
    out_path="{}".format(OUTDIR),
    train_X=None,
    test_X=None,
    train_y=y_train,
    test_y=y_test,
    train_preds=y_tr_preds,
    test_preds=y_preds,
    train_probs=y_tr_probs,
    test_probs=y_probs,
)
project_utils.classification_report(**report_kwargs)

Predictions provided, skipping them
Generating confusion matrix
Calculating classification metrics
Metric	Train	Test
Accuracy	0.7304	0.7215
Precision	0.6991	0.6897
Recall	0.7635	0.7493
F0.1-Score	0.6997	0.6903
F0.5-Score	0.7111	0.7009
F1-Score	0.7299	0.7183
F2-Score	0.7497	0.7365
F10-Score	0.7628	0.7486
Plotting PR Curve
Plotting ROC Curve
Generating report


# Scratch

In [None]:
def generate_slurm_train_script(input_dir,
                                pos_seqs,
                                neg_seqs,
                                val_seqs,
                                result_dir,
                                hyperparams,
                                preprocess,
                                features="fasta",
                                architecture="gkSVM"):
    """Write a SLURM batch script that trains a gkSVM model and scores three sequence sets.

    The script is written to ``train_<model>`` in the current working directory,
    where ``<model>`` is ``<preprocess>_<features>_<architecture>-clf_<hyperparams>``.
    It runs ``gkmtrain`` once and ``gkmpredict`` on the positive training,
    negative training and validation sequences. A suggested ``sbatch`` command
    is printed afterwards.

    Parameters
    ----------
    input_dir : str
        Directory that contains the three sequence files.
    pos_seqs, neg_seqs : str
        File names of the positive / negative training sequences.
    val_seqs : str
        File name of the validation sequences.
    result_dir : str
        Directory the generated script creates if missing; also the location
        of the model prefix used for model and prediction files.
    hyperparams : str
        Seven "-"-separated fields, in order: kernel (-t), word length (-l),
        informative columns (-k), mismatches (-d), reverse-complement flag
        ("True" or "False"; "True" adds -R), regularization (-c) and positive
        weight (-w). Example: "2-11-7-3-True-1.0-1.345".
    preprocess : str
        Preprocessing tag used as the first component of the model name.
    features : str, optional
        Feature tag used in the model name.
    architecture : str, optional
        Architecture tag used in the model name.

    Raises
    ------
    ValueError
        If ``hyperparams`` does not have exactly seven fields or its fifth
        field is neither "True" nor "False".
    """
    # Set up model name
    model = "{}_{}_{}-clf_{}".format(preprocess, features, architecture, hyperparams)
    model_name = os.path.join(result_dir, model)

    # Set up hyperparams
    fields = hyperparams.split("-")
    if len(fields) != 7:
        raise ValueError(
            "hyperparams must have 7 '-'-separated fields, got {}: {!r}".format(len(fields), hyperparams))
    kernel, word_length, inform_cols, mismatches, rev_comp, reg_param, pos_weight = fields

    # The flag arrives as text, so it must be compared explicitly: any non-empty
    # string (including "False") is truthy.
    if rev_comp not in ("True", "False"):
        raise ValueError(
            "reverse-complement field must be 'True' or 'False', got {!r}".format(rev_comp))

    gkm_flags = ["-t", kernel, "-l", word_length, "-k", inform_cols, "-d", mismatches]
    if rev_comp == "True":
        # NOTE(review): the lsgkm usage text describes -R as *disabling* reverse-complement
        # sharing; confirm that passing -R for REV_COMP=True is the intended meaning.
        gkm_flags.append("-R")
    gkm_flags += ["-c", reg_param, "-w", pos_weight]
    gkm_options = " ".join(gkm_flags)

    # Script header: SLURM directives, timestamp and job id
    output = ["#!/bin/bash", "#SBATCH --cpus-per-task=16", "#SBATCH --time=48:00:00",
              "#SBATCH --partition carter-compute", "#SBATCH -o ./out/%x.out", "#SBATCH -e ./err/%x.err\n"]
    output += ['date\necho -e "Job ID: $SLURM_JOB_ID\\n"\n']

    # Shell variables the commands below refer to
    output += ["trainposseqs={}".format(os.path.join(input_dir, pos_seqs)),
               "trainnegseqs={}".format(os.path.join(input_dir, neg_seqs)),
               "valseqs={}".format(os.path.join(input_dir, val_seqs)),
               "resultdir={}".format(result_dir),
               "modelname={}".format(model_name)]
    output += ["[ ! -d $resultdir ] && mkdir $resultdir\n"]

    # Training command followed by one prediction command per sequence set
    commands = ["gkmtrain $trainposseqs $trainnegseqs $modelname {} -v 2 -T $SLURM_CPUS_PER_TASK -m 8000.0".format(gkm_options)]
    for seq_var, tag in (("trainposseqs", "train-pos"),
                         ("trainnegseqs", "train-neg"),
                         ("valseqs", "test")):
        commands.append('gkmpredict ${} $modelname".model.txt" $modelname".{}.predict.txt"'.format(seq_var, tag))

    # Each command is echoed to the job log, executed, then followed by a blank line
    for command in commands:
        output += ["echo -e {}".format(command), command, 'echo -e "\\n"\n']

    output += ["date"]

    # Write to script
    with open("train_{}".format(model), "w") as f:
        f.write("\n".join(output))

    # Bash command to edit
    print("sbatch train_{0} --job-name=train_{0} --mem=20G".format(model))

# References