In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded every time a cell is executed, so edits to local packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from pathlib import Path
from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, RandomForestClassifier 
from sklearn.metrics import classification_report, accuracy_score
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import accuracy_score

from funmirtar.models.seeds import SEED_TYPE_TO_EXTRACTION_FUNCTION
from funmirtar.models.constants import SEEDS_TO_COUNT, SEED_COUNT_COLUMNS, GLOBAL_FEATURES, LOCAL_FEATURES, CLASSIFICATION_COLUMNS
from funmirtar.utils.plots import plot_prc_with_seeds, plot_feature_importance
from funmirtar.utils.file import make_dir_with_parents, extend_path_by_suffix_before_filetype


In [3]:
# Seed NumPy's global RNG. scikit-learn estimators that are created without an
# explicit `random_state` draw from this global generator.
# NOTE: the former `utils.check_random_state(3)` call was dropped. That helper
# only builds and returns a new RandomState object; because its return value
# was discarded, it never seeded anything.
np.random.seed(1)

In [4]:
# Identifier of this run. It is embedded in every model's prediction column
# name and in the output folder path. Alternative run names are kept
# commented out; enable exactly one.
# RUN_NAME = 'seeds.signal.local_features.model_optimisation'
RUN_NAME = 'seeds.signal.local_features'
# RUN_NAME = 'seeds.signal'
# RUN_NAME = 'seeds'

In [5]:
# One prediction column per trained model, each tagged with the run name.
MODEL_LIST = [
    f'logistic_regression.{RUN_NAME}',
    f'gradient_boosting_classifier.{RUN_NAME}',
    f'xgb.{RUN_NAME}',
    f'random_forest.{RUN_NAME}',
]

# MODEL_LIST = [
#     f'random_forest.default.{RUN_NAME}',
#     f'random_forest.optimised.{RUN_NAME}',
# ]

# Columns written to the prediction files: the classification columns first,
# followed by the per-model prediction columns.
OUT_COLUMNS = [*CLASSIFICATION_COLUMNS, *MODEL_LIST]

In [6]:
# Feature columns fed to every model, concatenated in a fixed order.
COLUMNS_FOR_PRED = [
    *GLOBAL_FEATURES,
    *LOCAL_FEATURES,
    *SEEDS_TO_COUNT,
    *SEED_COUNT_COLUMNS,
]

In [7]:
IN_FOLDER_PATH = "../data/preprocessed_for_ml/"

# FILE_PATH = '3utr.sequences.refseq_id.mirna_fc.seed_cls.sequence.signal.conservation.seed_cls.ts_preds'
FILE_PATH = '3utr.sequences.refseq_id.mirna_fc.seed_cls.sequence.signal.conservation.seed_cls.ts_preds.local_features'

# The train and test feature tables share one base name and differ only in
# their suffix.
IN_FEATURES_PATH_TRAIN = Path(IN_FOLDER_PATH, FILE_PATH + '.train.pkl')
IN_FEATURES_PATH_TEST = Path(IN_FOLDER_PATH, FILE_PATH + '.test.pkl')

# Display both resolved paths as the cell output.
IN_FEATURES_PATH_TRAIN, IN_FEATURES_PATH_TEST

(PosixPath('../data/preprocessed_for_ml/3utr.sequences.refseq_id.mirna_fc.seed_cls.sequence.signal.conservation.seed_cls.ts_preds.local_features.train.pkl'),
 PosixPath('../data/preprocessed_for_ml/3utr.sequences.refseq_id.mirna_fc.seed_cls.sequence.signal.conservation.seed_cls.ts_preds.local_features.test.pkl'))

In [8]:
# Predictions of each run go into their own sub-folder.
OUT_FOLDER_PATH = f"../data/predictions/{RUN_NAME}/"

PREDICTION_TYPE = '.class_preds'

# Output file names mirror the input base name, with the prediction type
# inserted before the train/test suffix.
OUT_PATH_TRAIN = Path(OUT_FOLDER_PATH, FILE_PATH + PREDICTION_TYPE + '.train.pkl')
OUT_PATH_TEST = Path(OUT_FOLDER_PATH, FILE_PATH + PREDICTION_TYPE + '.test.pkl')
OUT_PATH_TRAIN, OUT_PATH_TEST

(PosixPath('../data/predictions/seeds.signal.local_features/3utr.sequences.refseq_id.mirna_fc.seed_cls.sequence.signal.conservation.seed_cls.ts_preds.local_features.class_preds.train.pkl'),
 PosixPath('../data/predictions/seeds.signal.local_features/3utr.sequences.refseq_id.mirna_fc.seed_cls.sequence.signal.conservation.seed_cls.ts_preds.local_features.class_preds.test.pkl'))

In [9]:
# Prepare the run's output folder before anything is written into it.
# Presumably creates the folder together with missing parents (helper from
# funmirtar.utils.file) — confirm against its implementation.
make_dir_with_parents(OUT_FOLDER_PATH)

PosixPath('../data/predictions/seeds.signal.local_features')

In [10]:
# Load the preprocessed train / test feature tables.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# this project's own preprocessing pipeline.
data_train = pd.read_pickle(IN_FEATURES_PATH_TRAIN)
data_test = pd.read_pickle(IN_FEATURES_PATH_TEST)

In [None]:
# HOTFIX until we regenerate new data file with corrected naming.
# The same in-place column rename is applied to both tables.
for _frame in (data_train, data_test):
    _frame.rename(
        columns={'kmer6_mismatch_count': 'kmer6_bulge_or_mismatch_count'},
        inplace=True,
    )

In [None]:
# Feature matrices: keep only the prediction columns and replace missing
# values with 0 (fillna returns a new frame; the source tables are untouched).
x_train = data_train[COLUMNS_FOR_PRED].fillna(value=0)
x_test = data_test[COLUMNS_FOR_PRED].fillna(value=0)

In [None]:
# Target vectors taken from the `label` column of each table.
y_train = data_train['label']
y_test = data_test['label']

### Train models

#### Logistic regression

In [None]:
# Logistic-regression baseline with a generous iteration budget.
model_lr = LogisticRegression(max_iter=10000).fit(x_train, y_train)

# Class probabilities for both splits.
y_pred_lr_train = model_lr.predict_proba(x_train)
y_pred_lr_test = model_lr.predict_proba(x_test)

# Store the probability of the second class (column index 1) as the model
# score; presumably this is the positive label — confirm via model_lr.classes_.
data_train[f'logistic_regression.{RUN_NAME}'] = y_pred_lr_train[:, 1]
data_test[f'logistic_regression.{RUN_NAME}'] = y_pred_lr_test[:, 1]

#### Gradient Boosting Classifier

In [None]:
# Gradient boosting with default hyperparameters.
# random_state is pinned (42, matching the XGBoost and random forest cells) so
# the fitted model does not depend on NumPy's global RNG state, i.e. on which
# cells were executed before this one.
model_grad = GradientBoostingClassifier(random_state=42)
model_grad.fit(x_train, y_train)

# Class probabilities for both splits.
y_pred_grad_train = model_grad.predict_proba(x_train)
y_pred_grad_test = model_grad.predict_proba(x_test)

# Store the probability of the second class (column index 1) as the model
# score; presumably this is the positive label — confirm via model_grad.classes_.
data_train[f'gradient_boosting_classifier.{RUN_NAME}'] = y_pred_grad_train[:, 1]
data_test[f'gradient_boosting_classifier.{RUN_NAME}'] = y_pred_grad_test[:, 1]

In [None]:
# XGBoost classifier with a fixed seed and log-loss as evaluation metric.
xgb_classifier = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_classifier.fit(x_train, y_train)

# Hard class predictions (test split only) and class probabilities (both splits).
xgb_y_pred_class = xgb_classifier.predict(x_test)
xgb_y_pred_test = xgb_classifier.predict_proba(x_test)
xgb_y_pred_train = xgb_classifier.predict_proba(x_train)

# Test-set evaluation based on the hard class predictions.
xgb_accuracy = accuracy_score(y_test, xgb_y_pred_class)
xgb_report = classification_report(y_test, xgb_y_pred_class)

print(f'Accuracy (XGBoost): {xgb_accuracy}')
print('Classification Report (XGBoost):', xgb_report, sep='\n')

# Store the probability of the second class (column index 1) as the model score.
data_train[f'xgb.{RUN_NAME}'] = xgb_y_pred_train[:, 1]
data_test[f'xgb.{RUN_NAME}'] = xgb_y_pred_test[:, 1]

In [None]:
# Random forest with hand-picked hyperparameters.
# Notes from earlier tuning runs; the values presumably follow the order
# n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features
# (TODO confirm):
#   best: 250, 20, 20, 10, sqrt
#   250, 20, 100, 40, sqrt
#   100, None, 200, 40, 0.5
# The trailing comments on each argument list the values that were tried.
rf_classifier = RandomForestClassifier(
    n_estimators=250, # 100, 150, 200, 250
    max_depth=20, # None, 10, 20
    min_samples_split=100,  # 2, 5, 10, 20, 100, 150
    min_samples_leaf=40, # 1, 2, 4, 6, 10, 40, 60
    max_features='sqrt', # 'auto', 'sqrt', 'log2', float
    random_state=42,
)
rf_classifier.fit(x_train, y_train)

# Make predictions: hard classes (test split only) and class probabilities
# (both splits).
rf_y_pred_class = rf_classifier.predict(x_test)
rf_y_pred = rf_classifier.predict_proba(x_test)

rf_y_pred_train = rf_classifier.predict_proba(x_train)

# Evaluate the classifier on the test split
rf_accuracy = accuracy_score(y_test, rf_y_pred_class)
rf_report = classification_report(y_test, rf_y_pred_class)

print(f'Accuracy (Random Forest): {rf_accuracy}')
print('Classification Report (Random Forest):')
print(rf_report)

# Store the probability of the second class (column index 1) as the model score.
data_test[f'random_forest.{RUN_NAME}'] = rf_y_pred[:,1]
data_train[f'random_forest.{RUN_NAME}'] = rf_y_pred_train[:,1]

In [None]:
# NOTE(review): this whole cell is disabled. Before re-enabling it, restore the
# RandomizedSearchCV import (commented out in the import cell) and replace
# `x_seeds_signal_train` / `x_seeds_signal_test`, which are not defined in the
# visible part of this notebook (the active cells use `x_train` / `x_test`).
# # Define the parameter grid for hyperparameter tuning
# # Best Parameters: {'n_estimators': 100, 'min_samples_split': 200, 'min_samples_leaf': 40, 'max_features': 0.5, 'max_depth': None}
# param_dist = {
#     'n_estimators': [100, 175, 250],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [100, 200],
#     'min_samples_leaf': [40, 80],
#     'max_features': ['sqrt', 'log2', 0.5, None]
# }

# # Initialize a RandomForestClassifier
# rf_classifier = RandomForestClassifier(random_state=42)

# # Use RandomizedSearchCV to find the best hyperparameters
# # random_search = HalvingRandomSearchCV(...)
# random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist,
#                                    n_iter=25, cv=3, n_jobs=-1, random_state=42, verbose=2)
# random_search.fit(x_seeds_signal_train, y_train)

# # Get the best model
# best_rf_classifier = random_search.best_estimator_

# # Fit the best model
# best_rf_classifier.fit(x_seeds_signal_train, y_train)

# # Make predictions
# rf_y_pred_class = best_rf_classifier.predict(x_seeds_signal_test)
# rf_y_pred = best_rf_classifier.predict_proba(x_seeds_signal_test)

# rf_y_pred_train = best_rf_classifier.predict_proba(x_seeds_signal_train)

# # Evaluate the classifier
# rf_accuracy = accuracy_score(y_test, rf_y_pred_class)
# rf_report = classification_report(y_test, rf_y_pred_class)

# print(f'Best Parameters: {random_search.best_params_}')
# print(f'Accuracy (Random Forest): {rf_accuracy}')
# print('Classification Report (Random Forest):')
# print(rf_report)

# data_test[f'random_forest{RUN_NAME}'] = rf_y_pred[:, 1]
# data_train[f'random_forest{RUN_NAME}'] = rf_y_pred_train[:, 1]


In [None]:
# Plot the feature importances of the fitted random forest, labelled with the
# training feature column names. The output location is built from the run's
# output folder plus 'features'.
# NOTE(review): 40 is presumably the number of top features to show — confirm
# against plot_feature_importance's signature.
feature_importances = rf_classifier.feature_importances_
feature_names = x_train.columns
plot_feature_importance(feature_names, feature_importances, 40, OUT_FOLDER_PATH + 'features')

In [None]:
# Persist only the output columns (classification columns plus per-model
# predictions) of each split to its pickle file.
for _frame, _out_path in ((data_train, OUT_PATH_TRAIN), (data_test, OUT_PATH_TEST)):
    _frame[OUT_COLUMNS].to_pickle(_out_path)