# Imports

In [1]:
# General Imports
import pandas as pd
import numpy as np
from modify_dataset import load_from_pickle, load_to_pickle

# Plotting libraries
import plotly.express as px

# Dimensionality reduction
from sklearn.decomposition import PCA

# Model selection
from sklearn.model_selection import permutation_test_score

# Scalers
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, matthews_corrcoef
from sklearn.metrics import mean_absolute_error, r2_score

# Classification Models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Regression Models
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Error Analysis
# import eli5
# from lime.lime_tabular import LimeTabularExplainer

# Other Model Stuff
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Training & Test Sets

In [2]:
# Column labels for the two feature-selected matrices; error_analysis_classification uses them as
# DataFrame column names. "dd_psd_mf" is the variant with molecular functions, "dd_psd" the one without.
# NOTE(review): allow_pickle=True unpickles the file contents on load - only use with trusted files.
feature_selection_columns_dd_psd_mf = np.load("Dataset_Files/Feature_Selection/features_dd_psd_mf_list.npy",
                                              allow_pickle=True)
feature_selection_columns_dd_psd = np.load("Dataset_Files/Feature_Selection/features_dd_psd_list.npy",
                                           allow_pickle=True)

In [3]:
# Training arrays for the feature set that includes molecular functions (dd_psd_mf).
X_train_dd_psd_mf_feature_selection = np.load(
    "Dataset_Files/Training_Test_Sets/X_train_dd_psd_mf_feature_selection.npy")
y_train_dd_psd_mf = np.load(
    "Dataset_Files/Training_Test_Sets/y_train_dd_psd_mf.npy")

# Training arrays for the feature set without molecular functions (dd_psd).
X_train_dd_psd_feature_selection = np.load(
    "Dataset_Files/Training_Test_Sets/X_train_dd_psd_feature_selection.npy")
y_train_dd_psd = np.load(
    "Dataset_Files/Training_Test_Sets/y_train_dd_psd.npy")

# Test features exist once per feature set; both are scored against the same label
# vector, y_test_classification.
X_test_classification_dd_psd_mf_feature_selection = np.load(
    "Dataset_Files/Training_Test_Sets/X_test_classification_dd_psd_mf_feature_selection.npy")
X_test_classification_dd_psd_feature_selection = np.load(
    "Dataset_Files/Training_Test_Sets/X_test_classification_dd_psd_feature_selection.npy")
y_test_classification = np.load(
    "Dataset_Files/Training_Test_Sets/y_test_classification.npy")


In [4]:
# Sanity checks: report the size of every loaded array and the class balance of the test labels.
print(f"X_train_molecular_functions shape: "
      f"{X_train_dd_psd_mf_feature_selection.shape}")
print(f"y_train_molecular_functions shape: "
      f"{y_train_dd_psd_mf.shape}")
print(f"X_train_without_molecular_functions shape: "
      f"{X_train_dd_psd_feature_selection.shape}")
# Only the row count (not the shape tuple) is reported for this label vector.
print(f"y_train_without_molecular_functions shape: "
      f"{y_train_dd_psd.shape[0]}")
print(f"X_test_classification_molecular_functions shape: {X_test_classification_dd_psd_mf_feature_selection.shape}")
print(f"X_test_classification_without_molecular_functions shape: {X_test_classification_dd_psd_feature_selection.shape}")
# Label 1 is reported as "Binding", label 0 as "Non-Binding".
print(
    f"y_test_classification shape: {y_test_classification.shape[0]} "
    f"(Binding Count: {np.count_nonzero(y_test_classification == 1)}, "
    f"Non-Binding Count: {np.count_nonzero(y_test_classification == 0)})")

X_train_molecular_functions shape: (120561, 1045)
y_train_molecular_functions shape: (120561,)
X_train_without_molecular_functions shape: (134734, 1044)
y_train_without_molecular_functions shape: 134734
X_test_classification_molecular_functions shape: (30141, 1045)
X_test_classification_without_molecular_functions shape: (30141, 1044)
y_test_classification shape: 30141 (Binding Count: 22001, Non-Binding Count: 8140)


# Helper Functions

In [5]:
# Called when gridsearch is not used
def calculate_metrics_classification(y_true, y_pred):
    """Print the classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are written to stdout in the order F1, MCC, accuracy,
        recall, precision.
    """
    # Every metric is evaluated before anything is printed, so a failing metric
    # produces no partial report.
    computed = {
        "Recall score": recall_score(y_true, y_pred),
        "Precision score": precision_score(y_true, y_pred),
        "F1 score": f1_score(y_true, y_pred),
        "Accuracy score": accuracy_score(y_true, y_pred),
        "Matthews Correlation Coefficient": matthews_corrcoef(y_true, y_pred),
    }

    report_order = (
        "F1 score",
        "Matthews Correlation Coefficient",
        "Accuracy score",
        "Recall score",
        "Precision score",
    )
    for metric_label in report_order:
        print(f"{metric_label}: {computed[metric_label]}")


# Called when gridsearch is used
def best_model_information_classification(model):
    """Print a summary of the winning candidate of a fitted grid search.

    Parameters
    ----------
    model : fitted search object
        Must expose ``best_index_``, ``best_params_``, ``best_score_`` and a
        ``cv_results_`` mapping containing ``mean_test_mcc``,
        ``mean_test_accuracy``, ``mean_test_recall`` and ``mean_test_precision``.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    winner = model.best_index_
    print(f"Best model index: {winner}")
    print(f"Best parameters: {model.best_params_}")
    print(f"Best F1 score: {model.best_score_}")

    # Remaining metrics are looked up in cv_results_ at the winning candidate's row.
    secondary_metrics = (
        ("Matthews Correlation Coefficient", "mean_test_mcc"),
        ("Accuracy score", "mean_test_accuracy"),
        ("Recall score", "mean_test_recall"),
        ("Precision score", "mean_test_precision"),
    )
    for metric_label, results_key in secondary_metrics:
        print(f"{metric_label}: {model.cv_results_[results_key][winner]}")


# With the default 100 repetitions the smallest possible p-value is 1 / (100 + 1) = 0.0099009900990099011; it is
# returned when no model fitted on shuffled labels scores at least as well as the model fitted on the real labels.
# https://github.com/scikit-learn/scikit-learn/issues/8379
def permutation_test_classification(model, classification_group, repetitions=100):
    """Run a label-permutation test for ``model`` and plot the score distribution.

    The model is cross-validated (10 folds, Matthews correlation coefficient) on the
    training set of the chosen feature group and on ``repetitions`` label-shuffled
    versions of it. A histogram of the shuffled-label scores is shown, with the
    real-label score and its p-value marked by a dashed vertical line.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline handed to ``permutation_test_score``.
    classification_group : {'dd_psd', 'dd_psd_mf'}
        Selects which training arrays are used.
    repetitions : int, default=100
        Number of label permutations.

    Raises
    ------
    ValueError
        If ``classification_group`` is not one of the two supported names.
    """
    if classification_group == "dd_psd":
        X_train = X_train_dd_psd_feature_selection
        y_train = y_train_dd_psd
    elif classification_group == "dd_psd_mf":
        X_train = X_train_dd_psd_mf_feature_selection
        y_train = y_train_dd_psd_mf
    else:
        raise ValueError("Invalid group. Please choose 'dd_psd' or 'dd_psd_mf'")

    score, perm_scores, pvalue = permutation_test_score(model,
                                                        X_train,
                                                        y_train,
                                                        scoring=make_scorer(matthews_corrcoef),
                                                        cv=10,
                                                        n_permutations=repetitions,
                                                        n_jobs=-1,
                                                        random_state=42,
                                                        )

    # score and pvalue are scalars and are broadcast against the perm_scores column.
    permutation_test_dataframe = pd.DataFrame(data={"score": score,
                                                    "perm_scores": perm_scores,
                                                    "pvalue": pvalue}
                                              )

    fig = px.histogram(permutation_test_dataframe,
                       x="perm_scores",
                       histnorm='probability density',
                       labels={'probability density': 'Probability Density',
                               'perm_scores': 'Perm Scores'}
                       )
    # MCC lies in [-1, 1] and shuffled-label scores scatter around 0, so the axis must include
    # the negative side; a [0, 1] range would hide roughly half of the null distribution.
    fig.update_xaxes(range=[-1, 1], dtick=0.1)
    fig.add_vline(x=score,
                  line_dash='dash',
                  line_color='firebrick',
                  annotation_text=f"Original Data MCC: {round(score, 4)} <br> (P-Value: {round(pvalue, 6)})",
                  annotation_position="top right",
                  annotation_font_size=14)
    fig.show()


def prediction_category_classification(df):
    """Label one prediction as 'Correct', 'False Positive' or 'False Negative'.

    Parameters
    ----------
    df : mapping
        A single record (e.g. a DataFrame row) with the keys ``'True Class'``
        and ``'Prediction'``.

    Returns
    -------
    str
        ``'Correct'`` when both values match, ``'False Positive'`` when the true
        class is 0 and the prediction is 1, otherwise ``'False Negative'``.
    """
    actual, predicted = df['True Class'], df['Prediction']
    if actual == predicted:
        return 'Correct'

    predicted_positive_for_negative = (actual == 0) and (predicted == 1)
    return 'False Positive' if predicted_positive_for_negative else 'False Negative'


def error_analysis_classification(y_pred, classification_group):
    """Plot and tabulate where test-set predictions are right or wrong.

    Builds one dataframe from the test features of the chosen group, the true
    labels and ``y_pred``, tags every row via ``prediction_category_classification``,
    projects the standardised features onto two PCA components and shows a scatter
    plot coloured by prediction outcome. Summary counts are printed.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the test set, in the same row order as ``y_test_classification``.
    classification_group : {'dd_psd', 'dd_psd_mf'}
        Selects which test feature matrix and column labels are used.

    Returns
    -------
    pandas.DataFrame
        Features, ``'True Class'``, ``'Prediction'``, ``'Is the prediction correct?'`` and
        the two PCA columns, sorted by ``'Is the prediction correct?'``.

    Raises
    ------
    ValueError
        If ``classification_group`` is not one of the two supported names.
    """
    if classification_group == 'dd_psd':
        X_test = X_test_classification_dd_psd_feature_selection
        feature_columns = list(feature_selection_columns_dd_psd)
    elif classification_group == 'dd_psd_mf':
        X_test = X_test_classification_dd_psd_mf_feature_selection
        feature_columns = list(feature_selection_columns_dd_psd_mf)
    else:
        raise ValueError("Invalid group. Please choose 'dd_psd' or 'dd_psd_mf'")

    # Combining data into one dataframe
    X_set = pd.DataFrame(X_test, columns=feature_columns)
    y_set = pd.Series(y_test_classification, name="True Class")
    y_pred_series = pd.Series(y_pred, index=y_set.index, name="Prediction")
    error_analysis_dataframe = pd.concat([X_set, y_set, y_pred_series], axis=1)

    # Only the two label columns are needed to categorise a row; restricting the row-wise
    # apply to them avoids materialising every feature for every row.
    error_analysis_dataframe["Is the prediction correct?"] = error_analysis_dataframe[
        ["True Class", "Prediction"]].apply(prediction_category_classification, axis=1)

    # Scaling - uses the column list of the selected group (previously the dd_psd list was
    # used for both groups, so the dd_psd_mf projection was built from the wrong columns).
    scaled_data = StandardScaler().fit_transform(error_analysis_dataframe.loc[:, feature_columns])

    # PCA
    pca = PCA(n_components=2, random_state=0)
    pca.fit(scaled_data)
    pca_data = pca.transform(scaled_data)
    pca_dataframe_2d = pd.DataFrame(pca_data, columns=["PCA_Dimension_1", "PCA_Dimension_2"], index=y_set.index)

    # Joining dataframes
    error_analysis_dataframe = pd.concat([error_analysis_dataframe, pca_dataframe_2d], axis=1)

    # Plot
    fig = px.scatter(error_analysis_dataframe, x="PCA_Dimension_1", y="PCA_Dimension_2",
                     color="Is the prediction correct?",
                     symbol="Is the prediction correct?",
                     hover_data=['MW', 'TPSA', 'XLogP', 'NHD', 'NHA', 'NRB', 'True Class', 'Prediction'],
                     title="Correct Classifications vs Misclassifications")
    fig.show()

    # Useful stats
    is_correct = error_analysis_dataframe['Is the prediction correct?'] == 'Correct'
    true_class = error_analysis_dataframe['True Class']
    prediction = error_analysis_dataframe['Prediction']
    print(
        f"Number of correct classifications: {int(is_correct.sum())}")
    print(
        f"Number of misclassifications: {int((~is_correct).sum())}")
    print(
        f"False Positives (True class:0, Prediction:1): {int(((true_class == 0) & (prediction == 1)).sum())}")
    print(
        f"False Negatives (True class:1, Prediction:0): {int(((true_class == 1) & (prediction == 0)).sum())}")

    return error_analysis_dataframe.sort_values('Is the prediction correct?')


# def model_weights_classification(model, classification_group):
#     if classification_group == 'cd':
#         return eli5.show_weights(model,
#                                  feature_names=X_train_cd.columns,
#                                  target_names={1:"BBB+",0:"BBB-"})
#     elif classification_group == 'cd_se_i':
#         return eli5.show_weights(model,
#                                  feature_names=X_train_cd_se_i.loc[:,feature_selection_support].columns,
#                                  target_names={1:"BBB+",0:"BBB-"})
#     else:
#         raise ValueError("Invalid group. Please choose 'cd' or 'cd_se_i'")

# def get_lime_explainer_classification(classification_group):
#     if classification_group == 'cd':
#         X_train = X_train_cd
#         y_train = y_train_cd
#     elif classification_group == 'cd_se_i':
#         X_train = X_train_cd_se_i.loc[:,feature_selection_support]
#         y_train = y_train_cd_se_i
#     else:
#         raise ValueError("Invalid group. Please choose 'cd' or 'cd_se_i'")
#
#     explainer = LimeTabularExplainer(training_data=np.array(X_train),
#                                      mode='classification',
#                                      feature_names=list(X_train.columns),
#                                      training_labels=y_train,
#                                      class_names=['BBB-','BBB+'],
#                                      random_state=42)
#     return explainer

# Scorers evaluated for every grid-search candidate. The keys are the suffixes of the
# cv_results_ entries read elsewhere in this notebook (e.g. 'mean_test_mcc', 'rank_test_f1').
grid_search_scoring_dict = {
    scorer_name: make_scorer(metric_function)
    for scorer_name, metric_function in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('accuracy', accuracy_score),
        ('mcc', matthews_corrcoef),
    )
}

# Do Molecular Functions Improve our Predictive Performance?

## Dummy Classifier (DC)

In [6]:
# Baseline pipeline: standardise the features, then fit a DummyClassifier.
dummy_classifier = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', DummyClassifier(random_state=42)),
])
dummy_classifier.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', DummyClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': DummyClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__constant': None,
 'model__random_state': 42,
 'model__strategy': 'prior'}

### Drug Descriptors, Protein Sequence Descriptors

#### Training

In [7]:
# Fit the baseline on the dd_psd training set and score it on that same data.
y_train_pred = dummy_classifier.fit(
    X_train_dd_psd_feature_selection, y_train_dd_psd
).predict(X_train_dd_psd_feature_selection)
calculate_metrics_classification(y_true=y_train_dd_psd, y_pred=y_train_pred)

# Persist the fitted pipeline so the testing cell can reload it.
dump(dummy_classifier, 'Dataset_Files/Classification_Models/dc_dd_psd.joblib')

F1 score: 0.8459466288646107
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.7330221028099811
Recall score: 1.0
Precision score: 0.7330221028099811


['Dataset_Files/Classification_Models/dc_dd_psd.joblib']

#### Testing

In [8]:
# Load Model: reload the baseline pipeline saved for the dd_psd feature set.
dummy_classifier = load('Dataset_Files/Classification_Models/dc_dd_psd.joblib')

# Predict on the dd_psd test features and report the metrics against the test labels.
y_test_pred = dummy_classifier.predict(X_test_classification_dd_psd_feature_selection)
calculate_metrics_classification(y_test_classification, y_test_pred)

F1 score: 0.8438878447316942
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.729935967618858
Recall score: 1.0
Precision score: 0.729935967618858


### Drug Descriptors, Protein Sequence Descriptors, Molecular Functions

#### Training

In [9]:
# Refit the baseline on the dd_psd_mf training set and score it on that same data.
y_train_pred = dummy_classifier.fit(
    X_train_dd_psd_mf_feature_selection, y_train_dd_psd_mf
).predict(X_train_dd_psd_mf_feature_selection)
calculate_metrics_classification(y_true=y_train_dd_psd_mf, y_pred=y_train_pred)

# Persist the fitted pipeline so the testing cell can reload it.
dump(dummy_classifier, 'Dataset_Files/Classification_Models/dc_dd_psd_mf.joblib')

F1 score: 0.8438944400759479
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.7299458365474739
Recall score: 1.0
Precision score: 0.7299458365474739


['Dataset_Files/Classification_Models/dc_dd_psd_mf.joblib']

#### Testing

In [10]:
# Load Model: reload the baseline pipeline saved for the dd_psd_mf feature set.
dummy_classifier = load('Dataset_Files/Classification_Models/dc_dd_psd_mf.joblib')

# Predict on the dd_psd_mf test features and report the metrics against the test labels.
y_test_pred = dummy_classifier.predict(X_test_classification_dd_psd_mf_feature_selection)
calculate_metrics_classification(y_test_classification, y_test_pred)

F1 score: 0.8438878447316942
Matthews Correlation Coefficient: 0.0
Accuracy score: 0.729935967618858
Recall score: 1.0
Precision score: 0.729935967618858


## Logistic Regression (LR)

In [11]:
# Logistic-regression pipeline: standardise the features, then fit the classifier.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', LogisticRegression(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', LogisticRegression(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': LogisticRegression(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__l1_ratio': None,
 'model__max_iter': 100,
 'model__multi_class': 'auto',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'lbfgs',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [12]:
# Grid search over the LR pipeline: solver, penalty, iteration budget and class weighting.
# Every candidate is scored with all scorers in grid_search_scoring_dict over 10 folds.
# NOTE(review): some solver/penalty pairs in this grid are presumably rejected by
# LogisticRegression (e.g. 'elasticnet' without an l1_ratio) - confirm how those candidates score.
model = GridSearchCV(
    estimator=pipe,
    param_grid=dict(
        model__solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        model__penalty=['none', 'l2', 'l1', 'elasticnet'],
        model__max_iter=[50, 100, 200, 400, 600, 800, 1000, 2000, 4000, 8000],
        model__class_weight=[None, "balanced"],
    ),
    scoring=grid_search_scoring_dict,
    refit='f1',  # the best estimator is chosen (and refitted) by F1 score
    cv=10,
    return_train_score=False,
    n_jobs=-1,
    verbose=1,
)

### Drug Descriptors, Protein Sequence Descriptors

#### Training

In [None]:
# Run the LR grid search on the dd_psd training set and report the winning candidate.
model.fit(X_train_dd_psd_feature_selection, y_train_dd_psd)

optimised_lr_dd_psd = model.best_estimator_
best_model_information_classification(model)

# Save Model & CV Results
# The model goes into the same directory as the CV results, which is also where the
# permutation-testing and testing cells load it from.
dump(optimised_lr_dd_psd, 'Dataset_Files/Classification_Models/optimised_lr_dd_psd.joblib')
# np.save appends ".npy" to this path and stores the cv_results_ dict as a pickled object array.
np.save("Dataset_Files/Classification_Models/optimised_lr_dd_psd_cv_results", model.cv_results_)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


In [None]:
# Reload the saved cv_results_ dict. np.save wrote it to "<name>.npy" as a pickled 0-d object
# array, so the extension and allow_pickle=True are both required; .item() unwraps the dict.
logistic_regression_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_lr_dd_psd_cv_results.npy",
            allow_pickle=True).item()
).sort_values(by=["rank_test_f1"])  # best F1 rank first
logistic_regression_grid_search_dataframe

#### Permutation Testing

In [None]:
# Load Model: reload the saved optimised_lr_dd_psd pipeline and run the permutation test,
# which uses the dd_psd training arrays for the "dd_psd" group.
optimised_lr_dd_psd = load('Dataset_Files/Classification_Models/optimised_lr_dd_psd.joblib')
permutation_test_classification(optimised_lr_dd_psd, "dd_psd")

#### Testing

In [None]:
# Load Model: reload the saved optimised_lr_dd_psd pipeline.
optimised_lr_dd_psd = load('Dataset_Files/Classification_Models/optimised_lr_dd_psd.joblib')

# Predict on the dd_psd test features and report the metrics against the test labels.
y_test_pred = optimised_lr_dd_psd.predict(X_test_classification_dd_psd_feature_selection)
calculate_metrics_classification(y_test_classification, y_test_pred)

### Drug Descriptors, Protein Sequence Descriptors, Molecular Functions

#### Training

In [None]:
# Run the LR grid search on the dd_psd_mf training set and report the winning candidate.
model.fit(X_train_dd_psd_mf_feature_selection, y_train_dd_psd_mf)

optimised_lr_dd_psd_mf = model.best_estimator_
best_model_information_classification(model)

# Save Model & CV Results
# The model goes into the same directory as the CV results, which is also where the
# permutation-testing and testing cells load it from.
dump(optimised_lr_dd_psd_mf, 'Dataset_Files/Classification_Models/optimised_lr_dd_psd_mf.joblib')
# np.save appends ".npy" to this path and stores the cv_results_ dict as a pickled object array.
np.save("Dataset_Files/Classification_Models/optimised_lr_dd_psd_mf_cv_results", model.cv_results_)

In [None]:
# Reload the saved cv_results_ dict. np.save wrote it to "<name>.npy" as a pickled 0-d object
# array, so the extension and allow_pickle=True are both required; .item() unwraps the dict.
logistic_regression_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_lr_dd_psd_mf_cv_results.npy",
            allow_pickle=True).item()
).sort_values(by=["rank_test_f1"])  # best F1 rank first
logistic_regression_grid_search_dataframe

#### Permutation Testing

In [None]:
# Load Model: reload the saved optimised_lr_dd_psd_mf pipeline and run the permutation test,
# which uses the dd_psd_mf training arrays for the "dd_psd_mf" group.
optimised_lr_dd_psd_mf = load('Dataset_Files/Classification_Models/optimised_lr_dd_psd_mf.joblib')
permutation_test_classification(optimised_lr_dd_psd_mf, "dd_psd_mf")

#### Testing

In [None]:
# Load Model: reload the saved optimised_lr_dd_psd_mf pipeline.
optimised_lr_dd_psd_mf = load('Dataset_Files/Classification_Models/optimised_lr_dd_psd_mf.joblib')

# Predict on the dd_psd_mf test features and report the metrics against the test labels.
y_test_pred = optimised_lr_dd_psd_mf.predict(X_test_classification_dd_psd_mf_feature_selection)
calculate_metrics_classification(y_test_classification, y_test_pred)

## Support Vector Classification (SVC)

In [None]:
# Support-vector pipeline: standardise the features, then fit the classifier.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', SVC(random_state=42)),
])
pipe.get_params()

In [None]:
# Grid search over the SVC pipeline, scored with all scorers in grid_search_scoring_dict
# over 10 folds; the best estimator is chosen by F1.
#
# The grid is split per kernel so that only parameters a kernel actually uses are varied:
#   * 'precomputed' is left out - it expects a square kernel matrix as X, not the scaled
#     feature matrix this pipeline produces, so every such fit would fail;
#   * 'degree' only affects 'poly', and 'gamma' is unused by 'linear'; crossing them with
#     the other kernels only repeats identical fits.
model = GridSearchCV(estimator=pipe,
                     param_grid=[
                         {
                             'model__kernel': ['linear'],
                             'model__class_weight': [None, "balanced"],
                         },
                         {
                             'model__kernel': ['poly'],
                             'model__degree': [3, 4, 5, 6],
                             'model__gamma': ['scale', 'auto'],
                             'model__class_weight': [None, "balanced"],
                         },
                         {
                             'model__kernel': ['rbf', 'sigmoid'],
                             'model__gamma': ['scale', 'auto'],
                             'model__class_weight': [None, "balanced"],
                         },
                     ],
                     scoring=grid_search_scoring_dict,
                     refit='f1',  # Optimise for F1 Score
                     return_train_score=False,
                     cv=10,
                     n_jobs=-1)

### Drug Descriptors, Protein Sequence Descriptors

#### Training

In [None]:
# Run the SVC grid search on the dd_psd training set and report the winning candidate.
model.fit(X_train_dd_psd_feature_selection, y_train_dd_psd)

optimised_svc_dd_psd = model.best_estimator_
best_model_information_classification(model)

# Save Model & CV Results
# The model goes into the same directory as the CV results, which is also where the
# permutation-testing and testing cells load it from.
dump(optimised_svc_dd_psd, 'Dataset_Files/Classification_Models/optimised_svc_dd_psd.joblib')
# np.save appends ".npy" to this path and stores the cv_results_ dict as a pickled object array.
np.save("Dataset_Files/Classification_Models/optimised_svc_dd_psd_cv_results", model.cv_results_)

In [None]:
# Reload the saved cv_results_ dict. np.save wrote it to "<name>.npy" as a pickled 0-d object
# array, so the extension and allow_pickle=True are both required; .item() unwraps the dict.
svc_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_svc_dd_psd_cv_results.npy",
            allow_pickle=True).item()
).sort_values(by=["rank_test_f1"])  # best F1 rank first
svc_grid_search_dataframe

#### Permutation Testing

In [None]:
# Load Model: reload the saved optimised_svc_dd_psd pipeline and run the permutation test,
# which uses the dd_psd training arrays for the "dd_psd" group.
optimised_svc_dd_psd = load('Dataset_Files/Classification_Models/optimised_svc_dd_psd.joblib')
permutation_test_classification(optimised_svc_dd_psd, "dd_psd")

#### Testing

In [None]:
# Load Model: reload the saved optimised_svc_dd_psd pipeline.
optimised_svc_dd_psd = load('Dataset_Files/Classification_Models/optimised_svc_dd_psd.joblib')

# Predict on the dd_psd test features and report the metrics against the test labels.
y_test_pred = optimised_svc_dd_psd.predict(X_test_classification_dd_psd_feature_selection)
calculate_metrics_classification(y_test_classification, y_test_pred)

### Drug Descriptors, Protein Sequence Descriptors, Molecular Functions

#### Training

In [None]:
# Run the SVC grid search on the dd_psd_mf training set and report the winning candidate.
model.fit(X_train_dd_psd_mf_feature_selection, y_train_dd_psd_mf)

optimised_svc_dd_psd_mf = model.best_estimator_
best_model_information_classification(model)

# Save Model & CV Results
# The model goes into the same directory as the CV results, which is also where the
# permutation-testing and testing cells load it from.
dump(optimised_svc_dd_psd_mf, 'Dataset_Files/Classification_Models/optimised_svc_dd_psd_mf.joblib')
# np.save appends ".npy" to this path and stores the cv_results_ dict as a pickled object array.
np.save("Dataset_Files/Classification_Models/optimised_svc_dd_psd_mf_cv_results", model.cv_results_)

In [None]:
# Reload the saved cv_results_ dict. np.save wrote it to "<name>.npy" as a pickled 0-d object
# array, so the extension and allow_pickle=True are both required; .item() unwraps the dict.
svc_grid_search_dataframe = pd.DataFrame(
    np.load("Dataset_Files/Classification_Models/optimised_svc_dd_psd_mf_cv_results.npy",
            allow_pickle=True).item()
).sort_values(by=["rank_test_f1"])  # best F1 rank first
svc_grid_search_dataframe

#### Permutation Testing

In [None]:
# Load Model: reload the saved optimised_svc_dd_psd_mf pipeline and run the permutation test,
# which uses the dd_psd_mf training arrays for the "dd_psd_mf" group.
optimised_svc_dd_psd_mf = load('Dataset_Files/Classification_Models/optimised_svc_dd_psd_mf.joblib')
permutation_test_classification(optimised_svc_dd_psd_mf, "dd_psd_mf")

#### Testing

In [None]:
# Load Model: reload the saved optimised_svc_dd_psd_mf pipeline.
optimised_svc_dd_psd_mf = load('Dataset_Files/Classification_Models/optimised_svc_dd_psd_mf.joblib')

# Predict on the dd_psd_mf test features and report the metrics against the test labels.
y_test_pred = optimised_svc_dd_psd_mf.predict(X_test_classification_dd_psd_mf_feature_selection)
calculate_metrics_classification(y_test_classification, y_test_pred)

## Random Forest Classifier (RFC)

In [5]:
# Random-forest pipeline: standardise the features, then fit the classifier.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])
pipe.get_params()

{'memory': None,
 'steps': [('scale', StandardScaler()),
  ('model', RandomForestClassifier(random_state=42))],
 'verbose': False,
 'scale': StandardScaler(),
 'model': RandomForestClassifier(random_state=42),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': 42,
 'model__verbose': 0,
 'model__warm_start': False}

In [6]:
# Grid search over the RFC pipeline: forest size, split criterion, features per split and
# class weighting. Every candidate is scored with all scorers in grid_search_scoring_dict
# over 10 folds.
# NOTE(review): 'entropy' and 'log_loss' presumably select the same split criterion in
# scikit-learn - confirm whether both are needed, since each adds a third of the fits.
model = GridSearchCV(
    estimator=pipe,
    param_grid=dict(
        model__n_estimators=[100, 200, 400, 600, 800],
        model__criterion=['gini', 'entropy', 'log_loss'],
        model__max_features=[None, 'sqrt', 'log2'],
        model__class_weight=[None, 'balanced', 'balanced_subsample'],
    ),
    scoring=grid_search_scoring_dict,
    refit='f1',  # the best estimator is chosen (and refitted) by F1 score
    cv=10,
    return_train_score=False,
    n_jobs=-1,
    verbose=2,
)

### Without Molecular Functions

In [None]:
# Run the RFC grid search on the training set without molecular functions. The arrays are
# the ones loaded at the top of the notebook (dd_psd); the former names
# X_train_without_molecular_functions / y_train_without_molecular_functions are never
# defined and raised a NameError on a fresh kernel.
model.fit(X_train_dd_psd_feature_selection, y_train_dd_psd)

optimised_rfc_without_molecular_functions = model.best_estimator_
best_model_information_classification(model)

# Save Model (same path the RFC testing cell loads from)
dump(optimised_rfc_without_molecular_functions,
     'Classification_Models/optimised_rfc_without_molecular_functions.joblib')

Fitting 10 folds for each of 135 candidates, totalling 1350 fits


In [None]:
# Tabulate the CV results of every candidate, best F1 rank first, and hand the table to
# load_to_pickle (presumably stores it under the given name - confirm in modify_dataset).
rfc_grid_search_dataframe = pd.DataFrame(model.cv_results_).sort_values(by=["rank_test_f1"])
load_to_pickle(rfc_grid_search_dataframe, "rfc_grid_search_dataframe_without_molecular_functions")
rfc_grid_search_dataframe

#### Testing Without Feature Selection

In [None]:
# Reload the saved random-forest pipeline for the feature set without molecular functions.
optimised_rfc_without_molecular_functions = load(
    'Classification_Models/optimised_rfc_without_molecular_functions.joblib')

# Predict on the dd_psd test features loaded at the top of the notebook. The former
# X_test_classification dataframe is never defined here.
# NOTE(review): assumes the saved model was fitted on the dd_psd feature-selected matrix - confirm.
y_test_pred = optimised_rfc_without_molecular_functions.predict(X_test_classification_dd_psd_feature_selection)
calculate_metrics_classification(y_test_classification, y_test_pred)

### With Molecular Functions

In [None]:
# Run the RFC grid search on the training set with molecular functions. The arrays are the
# ones loaded at the top of the notebook (dd_psd_mf); the former names
# X_train_molecular_functions / y_train_molecular_functions are never defined.
model.fit(X_train_dd_psd_mf_feature_selection, y_train_dd_psd_mf)

# Kept under its own name and file so the model trained without molecular functions is
# not overwritten.
optimised_rfc_with_molecular_functions = model.best_estimator_
best_model_information_classification(model)

# Save Model
dump(optimised_rfc_with_molecular_functions,
     'Classification_Models/optimised_rfc_with_molecular_functions.joblib')

In [None]:
# Tabulate the CV results of every candidate, best F1 rank first, and hand the table to
# load_to_pickle (presumably stores it under the given name - confirm in modify_dataset).
# A "with_molecular_functions" name is used so the table from the run without molecular
# functions is not replaced.
rfc_grid_search_dataframe = pd.DataFrame(model.cv_results_).sort_values(by=["rank_test_f1"])
load_to_pickle(rfc_grid_search_dataframe, "rfc_grid_search_dataframe_with_molecular_functions")
rfc_grid_search_dataframe

NameError: name 'feature_selection_support' is not defined