In [2]:
# setting up imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [4]:
# Load the promoter / non-promoter sequences and assemble one labelled dataset.

# Each CSV is read with pandas' default header handling and its column is
# renamed to 'Sequence' (the assignment fails if a file has more than one column).
# NOTE(review): with the default `header` setting the first line of each file is
# treated as a header row -- confirm the files really have one, otherwise the
# first sequence of each file is silently dropped.
promoters = pd.read_csv('Data/promoter.csv')
promoters.columns = ['Sequence']
non_promoters = pd.read_csv('Data/non_promoter.csv')
non_promoters.columns = ['Sequence']

# Labels follow the concatenation order below: 1.0 for every promoter row,
# then 0.0 for every non-promoter row.
label_vector = np.ones(promoters.shape[0]).tolist() + np.zeros(non_promoters.shape[0]).tolist()

# ignore_index=True gives the stacked frame a fresh 0..N-1 index. Without it both
# source frames keep their own 0..n-1 labels, so every label would appear twice
# and label-based lookups (e.g. combined_data.loc[i]) would return two rows.
combined_data = pd.concat([promoters, non_promoters], axis=0, ignore_index=True)

# Cheap invariant: exactly one label per sequence.
assert len(label_vector) == len(combined_data), "labels and sequences are misaligned"

In [5]:
# Stratified train / validation / test split (70% / 10% / 20% of all samples)
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the full dataset as the test set, keeping the
# promoter / non-promoter ratio the same on both sides.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    combined_data,
    label_vector,
    stratify=label_vector,
    test_size=0.2,
    random_state=42,
)

# Step 2: carve the validation set out of the remaining 80%.
# test_size is 0.125 here because 0.125 * 80% = 10% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    stratify=y_train_val,
    test_size=0.125,
    random_state=42,
)

# Report how many samples ended up in each split.
print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

Train size: 42000
Validation size: 6000
Test size: 12000


In [6]:
# The split returns the training labels as a plain Python list;
# turn them into a NumPy array before they are handed to the grid search.
y_train = np.asarray(y_train)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
import numpy as np

class KmerVectorizer(BaseEstimator, TransformerMixin):
    """
    Custom transformer that counts fixed-length character k-mers in strings.

    Thin wrapper around ``CountVectorizer(analyzer='char', ngram_range=(K, K))``
    that exposes the k-mer length as a single hyper-parameter ``K`` so it can be
    tuned from a grid search (e.g. as ``kmer__K`` when used as a pipeline step
    named 'kmer'). Every other ``CountVectorizer`` option is left at its default.

    Parameters:
    - K: int, length of the k-mers to count (default 3). Must be >= 1.

    Attributes:
    - vectorizer: the underlying ``CountVectorizer`` once ``fit`` has been called,
      ``None`` before that.
    """
    def __init__(self, K=3):
        self.K = K
        self.vectorizer = None

    def fit(self, X, y=None):
        """
        Learn the k-mer vocabulary from the strings in ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.

        Returns ``self`` so calls can be chained.

        Raises ValueError if ``K`` is not a positive integer.
        """
        # Validate here rather than in __init__ so that set_params()/clone(),
        # which grid search relies on, never fail on construction.
        k_is_int = isinstance(self.K, (int, np.integer)) and not isinstance(self.K, bool)
        if not k_is_int or self.K < 1:
            raise ValueError(f"K must be a positive integer, got {self.K!r}")

        # A fresh vectorizer on every fit: re-fitting never reuses an old vocabulary.
        self.vectorizer = CountVectorizer(analyzer='char', ngram_range=(self.K, self.K))
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        """
        Convert the strings in ``X`` into k-mer counts using the fitted vocabulary.

        Returns whatever ``CountVectorizer.transform`` returns for ``X``.

        Raises NotFittedError if ``fit`` has not been called yet.
        """
        fitted = getattr(self, 'vectorizer', None)
        if fitted is None:
            # NotFittedError subclasses both ValueError and AttributeError, so code
            # that caught the old AttributeError on None keeps working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This KmerVectorizer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        return fitted.transform(X)

def gridsearch_with_k_cross_validation(
    combined_data,
    label_vector,
    model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
):
    """
    Perform grid search with cross-validation over k-mer sizes (K) and model hyperparameters.

    The search runs over a two-step pipeline: a ``KmerVectorizer`` under the step
    name 'kmer' followed by ``model`` under the step name 'model'. Keys in
    ``param_grid`` must therefore carry those prefixes, e.g. 'kmer__K' or
    'model__alpha'.

    Parameters:
    - combined_data: list, array or pd.Series of strings, the input sequences.
    - label_vector: list or array, labels corresponding to combined_data.
    - model: sklearn estimator, the machine learning model to train.
    - param_grid: dict or list of dicts, parameters to search (including K);
      passed to GridSearchCV unchanged.
    - cv: int, number of cross-validation folds.
    - scoring: str, scoring metric for grid search.
    - n_jobs: int or None, number of parallel jobs handed to GridSearchCV.
      Defaults to -1 (use all available processors). This used to be fixed at
      100 workers regardless of the machine, which oversubscribes smaller hosts
      and multiplies memory use; pass n_jobs=100 to get the old setting.

    Returns:
    - best_model: The best pipeline found (GridSearchCV's ``best_estimator_``).
    - best_params: The best parameters from grid search.
    - best_score: The best mean cross-validation score.
    """
    # Create a pipeline for k-mer vectorization and the model. K is left at its
    # default here; the grid search overrides it through 'kmer__K'.
    pipeline = Pipeline([
        ('kmer', KmerVectorizer()),
        ('model', model),
    ])

    # Perform grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    grid_search.fit(combined_data, label_vector)

    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_


In [11]:
# Naive Bayes: tune the k-mer length together with the smoothing strength,
# then persist the winning pipeline to disk.
import joblib
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

# 7 k-mer lengths x 3 alpha values = 21 candidate configurations.
param_grid = {
    'kmer__K': [4, 6, 8, 10, 12, 14, 16],
    'model__alpha': [0.1, 0.5, 1.0],
}

# Only the training split is used for the search (5-fold CV, scored on accuracy).
best_model, best_params, best_score = gridsearch_with_k_cross_validation(
    X_train['Sequence'].values,
    y_train,
    model,
    param_grid,
    cv=5,
    scoring='accuracy',
)

# saving the model
joblib.dump(best_model, 'best_naive_bayes_model.pkl')


In [12]:
# Show the winning configuration and its cross-validation score
# from the most recent grid search.
for label, value in (
    ("Best Parameters:", best_params),
    ("Best Cross-Validation Score:", best_score),
):
    print(label, value)


Best Parameters: {'kmer__K': 12, 'model__alpha': 0.1}
Best Cross-Validation Score: 0.8374285714285714


In [13]:
# training Random Forest model
# joblib is imported here so this cell can be run on its own; previously it was
# only in scope if the Naive Bayes cell had run first, and a missing import would
# surface as a NameError at the save step, after the whole search had finished.
import joblib
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

# 4 k-mer lengths x 3 forest sizes x 3 depth limits x 1 split setting
# = 36 candidates, each fitted on 5 folds (180 fits in total).
param_grid = {
    'kmer__K': [4, 8, 12, 16],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2],
}
best_model, best_params, best_score = gridsearch_with_k_cross_validation(
    combined_data=X_train['Sequence'].values,
    label_vector=y_train,
    model=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)
# saving the model
joblib.dump(best_model, 'best_random_forest_model.pkl')

KeyboardInterrupt: 

In [None]:
# Show the winning configuration and its cross-validation score.
# NOTE(review): best_params / best_score are shared globals that every model's
# search cell reassigns. If the search cell directly above did not run to
# completion, the values printed here still belong to the previous model.
for label, value in (
    ("Best Parameters:", best_params),
    ("Best Cross-Validation Score:", best_score),
):
    print(label, value)

In [None]:
# training SVM model
# joblib is imported here so this cell can be run on its own; previously it was
# only in scope if the Naive Bayes cell had run first.
import joblib
from sklearn.svm import SVC

# NOTE(review): probability=True makes every fit noticeably more expensive
# (SVC runs an extra internal cross-validation to calibrate probabilities) and
# accuracy scoring does not need it. It is kept so the saved model still offers
# predict_proba -- drop it if nothing downstream uses probabilities.
model = SVC(probability=True)

# The grid is split by kernel because `gamma` only affects the 'rbf' and 'poly'
# kernels. Crossing it with 'linear' fitted every linear candidate twice with
# identical results: 18 of the former 108 candidates (90 of 540 CV fits) were
# pure duplicates. When a linear kernel wins, best_params therefore has no
# 'model__gamma' entry.
param_grid = [
    {
        'kmer__K': [6, 8, 10, 12, 14, 16],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear'],
    },
    {
        'kmer__K': [6, 8, 10, 12, 14, 16],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
        'model__kernel': ['rbf', 'poly'],
    },
]

best_model, best_params, best_score = gridsearch_with_k_cross_validation(
    combined_data=X_train['Sequence'].values,
    label_vector=y_train,
    model=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)
# saving the model
joblib.dump(best_model, 'best_svm_model.pkl')


In [None]:
# Show the winning configuration and its cross-validation score.
# NOTE(review): best_params / best_score are shared globals that every model's
# search cell reassigns. If the search cell directly above did not run to
# completion, the values printed here still belong to the previous model.
for label, value in (
    ("Best Parameters:", best_params),
    ("Best Cross-Validation Score:", best_score),
):
    print(label, value)