In [1]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import uniform, rv_discrete
from sklearn.model_selection import train_test_split

In [18]:
# models req import
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [3]:
wax_data = pd.read_csv("../data/processed/wax_data.csv", index_col=0)
wax_K_data = pd.read_csv("../data/processed/wax_K_data.csv", index_col=0)

In [4]:
# Wax data

In [5]:
wax_data.head()

Unnamed: 0,5487157_55:C>T,3743550_33:A>C,3890504_12:A>G,3749835_12:T>A,5803165_43:G>C,3580391_37:C>T,3596752_10:G>A,5497838_52:C>G,3365135_13:C>G,3358883_23:T>C,...,3595103,3349314,3894206,3349326,3345945,3358741,3342391,3346044,3345296,wax
BK2,1,1,5,2,1,5,5,5,1,2,...,4,4,4,0,4,4,4,2,2,1
BK4,5,5,5,1,5,5,5,5,5,5,...,0,4,4,4,4,4,4,2,2,1
BK5,5,5,2,5,2,5,5,5,5,5,...,2,2,2,2,4,2,2,2,2,1
BK6,1,1,5,5,5,5,5,5,1,5,...,4,0,4,4,4,4,2,2,2,1
BK9,5,1,5,1,1,5,5,5,5,1,...,2,4,2,2,4,2,4,2,2,1


In [47]:
X = wax_data.drop("wax", axis=1)
y = wax_data["wax"]

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=40)

In [49]:
# Logistic Regression

In [50]:
lr = LogisticRegression(penalty="l1", solver="liblinear", random_state=101)
distributions = {"C": uniform(loc=0, scale=10)}
lr_sg = RandomizedSearchCV(lr, distributions, random_state=101, n_jobs=3, n_iter=100, cv=5)
lr_sg.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l1', random_state=101,
                                                solver='liblinear', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=100, n_jobs=3,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3218619438>},
                   pre_dispatch='2*n_jobs', random_state=101, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [51]:
lr_sg.best_params_

{'C': 5.163986277024462}

In [30]:
joblib.dump(lr_sg.best_estimator_, "../models/logistic_regression_wax")

['../models/logistic_regression_wax']

In [31]:
# Random forest

In [32]:
rf = RandomForestClassifier(random_state=101)
distributions = {"n_estimators": np.arange(20, 500, 1)}

rf_sg = RandomizedSearchCV(rf, distributions, random_state=101, n_jobs=3, n_iter=100, cv=5)
rf_sg.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [52]:
rf_sg.best_params_

{'n_estimators': 148}

In [33]:
joblib.dump(rf_sg.best_estimator_, "../models/random_forest_wax")

['../models/random_forest_wax']

In [34]:
# XGBClassifier

In [54]:
xgb = XGBClassifier(random_state=101, reg_lambda=0, reg_alpha=1)
distributions = {"reg_alpha": uniform(loc=0, scale=10), "n_estimators": np.random.randint(20, 500, 1)}

xgb_sg = RandomizedSearchCV(xgb, distributions, random_state=101, n_jobs=3, n_iter=100, cv=5)
xgb_sg.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=101, reg_alpha=1,
                                           reg_lambda=0, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid='

In [56]:
xgb_sg.best_params_

{'n_estimators': 370, 'reg_alpha': 5.163986277024462}

In [57]:
joblib.dump(xgb_sg.best_estimator_, "../models/XGBoost_wax")

['../models/XGBoost_wax']

In [58]:
# Models metrics

In [63]:
lr_predicted = lr_sg.best_estimator_.predict(X_test)
rf_predicted = rf_sg.best_estimator_.predict(X_test)
xgb_predicted = xgb_sg.best_estimator_.predict(X_test)

In [64]:
# Logistic regression report
print(classification_report(y_test, lr_predicted))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        16
           1       0.98      0.98      0.98        58

    accuracy                           0.97        74
   macro avg       0.96      0.96      0.96        74
weighted avg       0.97      0.97      0.97        74



In [65]:
# Random forest report
print(classification_report(y_test, rf_predicted))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90        16
           1       0.97      0.98      0.97        58

    accuracy                           0.96        74
   macro avg       0.95      0.93      0.94        74
weighted avg       0.96      0.96      0.96        74



In [66]:
# XGBClassifier report
print(classification_report(y_test, xgb_predicted))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91        16
           1       0.98      0.97      0.97        58

    accuracy                           0.96        74
   macro avg       0.93      0.95      0.94        74
weighted avg       0.96      0.96      0.96        74

