In [1]:
# Install scikit-learn with pip into the environment of the running Jupyter kernel:
# `sys.executable` is the interpreter that is executing this kernel, so the package
# lands in that interpreter's site-packages rather than in whatever `pip` is on PATH.
# NOTE(review): the version is not pinned, while later cells import
# `sklearn.externals.joblib`; recent scikit-learn releases presumably no longer ship
# that module — confirm and pin a compatible version.
import sys
!{sys.executable} -m pip install scikit-learn



In [1]:
%%writefile ./libs/util_modelo_referencial_old.py
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import util_caracteristicas, util_fasta
from sklearn.preprocessing import StandardScaler
from sklearn.externals.joblib import dump, load
from sklearn.utils import shuffle

def crear_modelo_referencial(identificador, tuned_parameters, scores, n_jobs):
    """Grid-search an SVC on precomputed features and persist the outcome.

    Reads ``./data/<identificador>.lncRNA.fasta`` and
    ``./data/<identificador>.PCT.fasta`` through ``util_fasta.leer_fasta``, calls
    ``util_caracteristicas.generar_modelo_CPAT`` with the keys of both mappings,
    and obtains one feature row per entry from
    ``util_caracteristicas.generar_caracteristicas``. lncRNA rows are labelled 1
    and PCT rows 0.

    Args:
        identificador: Name used to locate the FASTA files and to name the
            output files under ``./modelos_referenciales/``.
        tuned_parameters: Parameter grid handed to ``GridSearchCV``.
        scores: Iterable; every element is used as the ``scoring`` argument of
            one grid search. The search is refit on ``"accuracy"`` and the
            ``mean_test_accuracy`` / ``mean_test_precision`` /
            ``mean_test_recall`` entries of ``cv_results_`` are read, so each
            element has to produce those metrics.
        n_jobs: Forwarded to ``GridSearchCV``.

    Returns:
        None. For every element of ``scores`` the best estimator is dumped to
        ``modelo_<identificador>.pkl`` and the metrics dictionary to
        ``resultado_<identificador>.bin``; later iterations overwrite the files
        written by earlier ones.
    """
    secuencias_lncRNA = util_fasta.leer_fasta("./data/" + identificador + ".lncRNA.fasta")
    secuencias_PCT = util_fasta.leer_fasta("./data/" + identificador + ".PCT.fasta")

    util_caracteristicas.generar_modelo_CPAT(
        identificador, secuencias_lncRNA.keys(), secuencias_PCT.keys()
    )

    def _filas_de_caracteristicas(secuencias):
        # One list of feature values per entry of the mapping returned by the helper.
        por_codigo = util_caracteristicas.generar_caracteristicas(identificador, secuencias)
        return [list(valores.values()) for valores in por_codigo.values()]

    filas_lncRNA = _filas_de_caracteristicas(secuencias_lncRNA)
    filas_PCT = _filas_de_caracteristicas(secuencias_PCT)

    # Positive class (1) = lncRNA, negative class (0) = PCT.
    muestras = filas_lncRNA + filas_PCT
    etiquetas = [1] * len(filas_lncRNA) + [0] * len(filas_PCT)

    # No hold-out split: the whole shuffled set goes through cross-validation.
    X_train, y_train = shuffle(muestras, etiquetas, random_state=0)

    for criterio in scores:
        busqueda = GridSearchCV(
            SVC(),
            tuned_parameters,
            cv=10,
            scoring=criterio,
            n_jobs=n_jobs,
            refit="accuracy",
        )
        busqueda.fit(X_train, y_train)
        dump(
            busqueda.best_estimator_,
            './modelos_referenciales/modelo_{}.pkl'.format(identificador),
            compress=1,
        )

        # Mean cross-validated metrics of the winning parameter combination.
        mejor = busqueda.best_index_
        medias = busqueda.cv_results_
        resultado = {
            "accuracy": medias['mean_test_accuracy'][mejor],
            "precision": medias['mean_test_precision'][mejor],
            "recall": medias['mean_test_recall'][mejor],
        }
        dump(resultado, './modelos_referenciales/resultado_{}.bin'.format(identificador))

Writing ./libs/util_modelo_referencial_old.py


In [1]:
%%writefile ./libs/util_modelo_referencial.py
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import util_caracteristicas, util_fasta
from sklearn.preprocessing import StandardScaler
from sklearn.externals.joblib import dump, load
from sklearn.utils import shuffle
import random

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import string
from tempfile import mkdtemp

class GeneradorFeatures(BaseEstimator, TransformerMixin):
    """Transformer that converts ``(code, sequence)`` pairs into feature rows.

    ``fit`` splits the training pairs by label and passes them to
    ``util_caracteristicas.generar_modelo_CPAT``; ``transform`` requests the
    features of the given pairs from
    ``util_caracteristicas.generar_caracteristicas``. Both calls use the same
    per-instance identifier (``random_id``).

    Args:
        identificador: Prefix of the per-instance identifier. ``None`` is
            accepted and yields an identifier without prefix.

    Attributes:
        identificador: The constructor argument, stored untouched so that
            ``get_params`` / ``clone`` round-trip it.
        random_id: ``None`` until first needed, then
            ``"<identificador>_fold_<5 random chars>"``.
    """

    def __init__(self, identificador=None):
        self.identificador = identificador
        # Created lazily (see _obtener_random_id): the constructor performs no
        # computation and no longer fails on the default ``identificador=None``.
        # NOTE(review): joblib's Memory presumably hashes the estimator state, so
        # a random value drawn here would make every unfitted clone look
        # different to a Pipeline ``memory`` cache — confirm.
        self.random_id = None

    def _obtener_random_id(self):
        """Return this instance's identifier, generating it on first use."""
        if self.random_id is None:
            prefijo = "" if self.identificador is None else str(self.identificador)
            sufijo = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
            self.random_id = prefijo + "_fold_" + sufijo
        return self.random_id

    def fit(self, X, y=None):
        """Split ``X`` by label and hand both groups to ``generar_modelo_CPAT``.

        Args:
            X: Sequence of records; element 0 of each record is used as the
                code and element 1 as the associated value.
            y: Labels aligned with ``X``. Label ``0`` goes to the PCT group,
                any other value to the lncRNA group.

        Returns:
            self

        Raises:
            TypeError: If ``y`` is ``None``.
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if y is None:
            raise TypeError("GeneradorFeatures.fit requires the class labels in y")
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length (got {} and {})".format(len(X), len(y))
            )

        identificador = self._obtener_random_id()
        codigos_lncRNA = {}
        codigos_PCT = {}
        # Positional pairing, so it does not depend on how ``y`` is indexed.
        for registro, etiqueta in zip(X, y):
            destino = codigos_PCT if etiqueta == 0 else codigos_lncRNA
            destino[registro[0]] = registro[1]
        util_caracteristicas.generar_modelo_CPAT(identificador, codigos_lncRNA, codigos_PCT)
        return self

    def transform(self, X):
        """Return one list of feature values per entry produced for ``X``.

        Args:
            X: Sequence of records; element 0 is the code, element 1 the value.

        Returns:
            list[list]: The ``values()`` of every entry of the mapping returned
            by ``generar_caracteristicas``, in that mapping's iteration order.
        """
        identificador = self._obtener_random_id()
        # Records sharing the same code collapse into a single dictionary entry.
        codigos = {registro[0]: registro[1] for registro in X}
        dict_features = util_caracteristicas.generar_caracteristicas(identificador, codigos)
        # NOTE(review): rows are presumably returned in the same order as X —
        # confirm against generar_caracteristicas, since labels are matched by position.
        return [list(fila.values()) for fila in dict_features.values()]

def crear_modelo_referencial(identificador, tuned_parameters, scores, n_jobs):
    """Grid-search a feature-generation + SVC pipeline and store its metrics.

    Reads ``./data/<identificador>.lncRNA.fasta`` and
    ``./data/<identificador>.PCT.fasta`` through ``util_fasta.leer_fasta`` and
    uses the ``(code, value)`` items of both mappings as samples; lncRNA items
    are labelled 1 and PCT items 0. Feature generation happens inside the
    pipeline (``GeneradorFeatures``), so it is fitted on the training part of
    each cross-validation split.

    Args:
        identificador: Name used to locate the FASTA files, passed to
            ``GeneradorFeatures`` and used to name the result file.
        tuned_parameters: Parameter grid for ``GridSearchCV``; keys must be
            prefixed with the pipeline step names (``features__``, ``svc__``).
        scores: Iterable; every element is used as the ``scoring`` argument of
            one grid search. The search is refit on ``"accuracy"`` and the
            ``mean_test_accuracy`` / ``mean_test_precision`` /
            ``mean_test_recall`` entries of ``cv_results_`` are read, so each
            element has to produce those metrics.
        n_jobs: Forwarded to ``GridSearchCV``.

    Returns:
        None. The metrics dictionary of each search is dumped to
        ``./modelos_referenciales/resultado_<identificador>.bin``; later
        iterations overwrite the file written by earlier ones.
    """
    import shutil  # local import: only needed to dispose of the temporary cache

    codigos_lncRNA = util_fasta.leer_fasta("./data/" + identificador + ".lncRNA.fasta")
    codigos_PCT = util_fasta.leer_fasta("./data/" + identificador + ".PCT.fasta")

    X = list(codigos_lncRNA.items()) + list(codigos_PCT.items())
    y = ([1] * len(codigos_lncRNA)) + ([0] * len(codigos_PCT))
    X_train, y_train = shuffle(X, y, random_state=0)

    # Directory backing the pipeline's transformer cache. It was previously
    # never deleted, leaving one stale directory per call; it is now removed
    # once every search has finished, even if a search raises.
    cachedir = mkdtemp()
    try:
        svm_pipeline = Pipeline(
            steps=[('features', GeneradorFeatures(identificador)), ('svc', SVC())],
            memory=cachedir,
        )

        for score in scores:
            clf = GridSearchCV(svm_pipeline, tuned_parameters, cv=10, scoring=score,
                               n_jobs=n_jobs, refit="accuracy")
            clf.fit(X_train, y_train)
            # Mean cross-validated metrics of the winning parameter combination.
            resultado = {
                "accuracy" : clf.cv_results_['mean_test_accuracy'][clf.best_index_],
                "precision" : clf.cv_results_['mean_test_precision'][clf.best_index_],
                "recall" : clf.cv_results_['mean_test_recall'][clf.best_index_]
            }
            dump(resultado, './modelos_referenciales/resultado_{}.bin'.format(identificador))
    finally:
        # Nothing fitted is returned to the caller, so the cache has no further use.
        shutil.rmtree(cachedir, ignore_errors=True)

Overwriting ./libs/util_modelo_referencial.py


In [3]:
%%time
# Driver cell: runs the grid search for one dataset and prints the stored metrics.
import sys
# Make the modules written to ./libs by the %%writefile cells importable.
sys.path.append('./libs')
import util_modelo_referencial
import warnings
# Silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")
from sklearn.externals.joblib import load

print("Iniciando proceso...")
identificador = "Especie2"
# NOTE: these first values of `tuned_parameters` and `scores` are never used;
# both names are reassigned further down, right before the call.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision_macro', 'recall_macro', 'accuracy']
# Log of parameter grids tried earlier, kept for reference only.
# The token before each grid is the author's note for that run: a number
# (presumably the elapsed time; the unit is not recorded — confirm),
# "no_termina" (the run did not finish) or "best".
#2.43 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [1]}]
#3.40 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [10]}]
#13.00 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [100]}]
#4.18 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [1, 10]}] #n_jobs=None
#3.7 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [1, 10]}] #n_jobs=-1
#8.34 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [100]}]
#no_termina tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [1000]}]
#3.03 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-4], 'C': [1, 10, 100]}]
#no_termina tuned_parameters = [{'kernel': ['linear'], 'C': [1, 10, 100]}]
#no_termina tuned_parameters = [{'kernel': ['linear'], 'C': [1]}]
#no_termina tuned_parameters = [{'kernel': ['linear'], 'C': [10]}]
#no_termina tuned_parameters = [{'kernel': ['linear'], 'C': [100]}]
#no_termina tuned_parameters = [{'kernel': ['linear'], 'C': [1000]}]
#
#25.16 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [1,10,20,30,40,50,60,70,80,90,100]}]
#3.15 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [0.1,0.5,0.9,1]}]
#best tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [0.1,1,2]}]
#3.00 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-4,1e-5], 'C': [0.01,0.1,1,10]}]
#3.39 tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-5,1e-6], 'C': [10,15,20,100]}]
#tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-5], 'C': [60,80,100,120,140]}]
#tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-5], 'C': [900,1000,5000,10000]}]
# Grid actually used. Keys carry the pipeline step prefixes ('svc__', 'features__')
# because crear_modelo_referencial searches over a Pipeline with steps named
# 'features' and 'svc'.
tuned_parameters = [{'svc__kernel': ['rbf'], 'svc__gamma': [1e-3], 'svc__C': [0.1,0.5,0.9,2], 'features__identificador': [identificador]}]
# A single element holding three metric names: the loop over `scores` inside
# crear_modelo_referencial runs once and passes the whole list as `scoring`.
scores = [['accuracy','precision','recall']]
util_modelo_referencial.crear_modelo_referencial(identificador, tuned_parameters, scores, n_jobs=-1)
# Reload and show the metrics dictionary that the call above dumped to disk.
print(load('./modelos_referenciales/resultado_{}.bin'.format(identificador)))

print("Proceso terminado...")

Iniciando proceso...
{'accuracy': 0.903041825095057, 'precision': 0.8570791181000229, 'recall': 0.967680608365019}
Proceso terminado...
CPU times: user 23.4 s, sys: 317 ms, total: 23.7 s
Wall time: 4min 36s
