# Course: Bioinformatics for Translational Medicine
## Date: April-May 2022

The assignment was to create a machine learning model for a cancer classification task. This script tests several classification models and feature selection methods with a grid search and cross-validation. The best combination is selected, its hyperparameters are optimized, and the final model is trained.

## Environment setup

In [None]:
# General
import numpy as np
import pandas as pd
import joblib

# Cross-validation
from sklearn.pipeline import Pipeline # model steps
from sklearn.model_selection import GridSearchCV # grid search
from sklearn.model_selection import RepeatedKFold # inner cross validation

# Feature selection and classification
from sklearn.feature_selection import SelectFromModel # selection based on model
from sklearn.svm import LinearSVC # linear SVM feature selection             
from sklearn.linear_model import SGDClassifier # linear SVM classifier

## Load and prepare data

In [None]:
# Load the data (Train_call.txt) and the labels (Train_clinical.txt);
# both files are read as tab-separated tables.
train_call, train_clinical = (
    pd.read_csv(path, sep="\t")
    for path in ("Data/Train_call.txt", "Data/Train_clinical.txt")
)

In [None]:
# Keep only the per-sample columns and flip the table so that rows are
# samples and columns are features (expected: 100 samples x 2834 features).
# NOTE(review): the label-based slice relies on 'Array.129' being the first
# and 'Array.130' the last sample column in the file -- confirm.
train_features = train_call.loc[:, 'Array.129':'Array.130'].T
print("shape of train features data: ", train_features.shape) # 100 x 2834

# Index the class labels by sample name and clear the index title
# (expected: 100 class labels).
# NOTE(review): the label rows are not explicitly aligned with the rows of
# train_features; this presumably relies on both files listing the samples
# in the same order -- confirm.
train_labels = train_clinical.set_index('Sample').rename_axis(None)
print("shape of train labels data: ", train_labels.shape)

shape of train features data:  (100, 2834)
shape of train labels data:  (100, 1)


In [None]:
# Short aliases for the training data: x holds the feature table, y the
# class labels flattened into a 1-D array.
x, y = train_features, train_labels.to_numpy().ravel()

## Feature Selection and Classifier Options
Retrieve the best method combination from the cross-validation results, and use this combination in the inner cross-validation loop to determine the final hyperparameter values.

In [None]:
# Feature selection: rank features with an L1-penalised linear SVM.
# threshold=-inf switches off the importance cut-off, so max_features alone
# decides how many features are kept.
fs_selector = SelectFromModel(
    LinearSVC(penalty='l1', dual=False, max_iter=10000),
    threshold=-np.inf,
)
fs_grid = {
    'selector__max_features': [50, 100, 150, 200],
    'selector__estimator__C': np.logspace(-4, 1, 6),
}

# Classification: linear SVM (hinge loss) trained with SGD; the
# regularisation strength alpha is the only searched parameter.
clf_model = SGDClassifier(loss='hinge', random_state=42, max_iter=10000)
clf_grid = {'classifier__alpha': np.logspace(-4, 1, 6)}

## Cross-Validation: Hyperparameter Selection

In [None]:
# Perform cross-validation on combination of methods.
def crossvalidation(fs_selector, fs_grid, clf_model, clf_grid, x, y):
    """Grid-search a selector + classifier pipeline with repeated k-fold CV.

    The feature selector and the classifier are chained in a Pipeline (steps
    'selector' and 'classifier'), so the selector is refitted inside every
    cross-validation split together with the classifier.

    Parameters
    ----------
    fs_selector : estimator
        Feature-selection step of the pipeline.
    fs_grid : dict
        Search space for the selector; keys are pipeline parameter names
        (prefixed with 'selector__').
    clf_model : estimator
        Classification step of the pipeline.
    clf_grid : dict
        Search space for the classifier; keys are pipeline parameter names
        (prefixed with 'classifier__').
    x : array-like
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y : array-like
        Training labels, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    best_score : float
        ``best_score_`` of the fitted grid search (scored with 'accuracy').
    best_parameters : dict
        ``best_params_`` of the fitted grid search.
    """
    # Merge both search spaces; on a duplicate key the classifier grid wins.
    searchgrid = {**fs_grid, **clf_grid}
    pipeline = Pipeline([('selector', fs_selector), ('classifier', clf_model)])

    # 5-fold cross-validation repeated 10 times with a fixed seed.
    # NOTE(review): RepeatedKFold does not stratify by class; consider
    # RepeatedStratifiedKFold if per-fold class balance matters.
    inner_cv = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10)
    gridsearch = GridSearchCV(pipeline, searchgrid, scoring='accuracy', cv=inner_cv, refit=True, n_jobs=2)
    print("gridsearch: ", gridsearch)

    # Run the search on the training data and report the winning combination.
    gridsearch.fit(x, y)
    return gridsearch.best_score_, gridsearch.best_params_

In [None]:
# Run the grid search on the training data and report the best combination.
cv_best_score, cv_best_parameters = crossvalidation(
    fs_selector=fs_selector,
    fs_grid=fs_grid,
    clf_model=clf_model,
    clf_grid=clf_grid,
    x=x,
    y=y,
)
print(cv_best_score, cv_best_parameters)

gridsearch:  GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=42),
             estimator=Pipeline(steps=[('selector',
                                        SelectFromModel(estimator=LinearSVC(dual=False,
                                                                            max_iter=10000,
                                                                            penalty='l1'),
                                                        threshold=-inf)),
                                       ('classifier',
                                        SGDClassifier(max_iter=10000,
                                                      random_state=42))]),
             n_jobs=2,
             param_grid={'classifier__alpha': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01]),
                         'selector__estimator__C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01]),
                         'selector__max_features': [50, 100, 150, 200]},
          

In [None]:
# Show the best cross-validated score and its hyperparameters again.
print(cv_best_score, cv_best_parameters)

0.8340000000000001 {'classifier__alpha': 0.1, 'selector__estimator__C': 0.1, 'selector__max_features': 50}


## Build Final Model

In [None]:
# Rebuild the pipeline with the hyperparameters reported by the grid search
# (C=0.1, max_features=50, alpha=0.1) and fit it on the full training set.
# LASSO-lSVM-based feature selection.
fs_selector = SelectFromModel(
    estimator=LinearSVC(C=0.1, penalty='l1', dual=False, max_iter=10000),
    max_features=50,
    threshold=-np.inf,
)
# Linear SVM classifier trained with SGD.
clf_model = SGDClassifier(alpha=0.1, loss='hinge', max_iter=10000, random_state=42)

# Chain selector and classifier, then train the final model.
steps = [('selector', fs_selector), ('classifier', clf_model)]
model = Pipeline(steps=steps)
model.fit(x, y)

Pipeline(steps=[('selector',
                 SelectFromModel(estimator=LinearSVC(C=0.1, dual=False,
                                                     max_iter=10000,
                                                     penalty='l1'),
                                 max_features=50, threshold=-inf)),
                ('classifier',
                 SGDClassifier(alpha=0.1, max_iter=10000, random_state=42))])

## Save final model

In [None]:
#joblib.dump(model, 'savemodel.pkl')

['model.pkl']

In [None]:
# load the model from disk
# model = joblib.load('savemodel.pkl')