In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from feature_engine.selection import DropCorrelatedFeatures

from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_evaluations, plot_convergence, plot_objective
from skopt.utils import dump, load

# Star imports stay ahead of the explicit sklearn imports so the explicit names win on a clash.
from utils.data_preparation import *
from utils.data_exploration import *
from utils.training import *

from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# Task identifier: passed to data_preprocessing() and used to build the
# data/... and optimization/... file paths throughout this notebook.
task = "cyp2c19"

## Loading the Dataset + Data Cleaning

In terms of data cleaning the following steps are performed:

- Normalization of smiles strings before calculating descriptors and fingerprints
  - Normalization includes the removal of metals in the molecule (<span style="color:cyan">TODO</span> Why?)
- Removal of small molecules
  - For example: 
    - Molecules consisting of a single atom (<span style="color:cyan">TODO</span> Why?)
    - Molecules that are metals
- *Molecular Descriptors:* Removal of NaN values by dropping either the corresponding column or the corresponding row. 
  - For molecular descriptors it doesn't make much sense to fill missing values with some default value or mean of the existing values


In the first iteration we will focus on using the Morgan fingerprints. If there is time later we will explore other fingerprints and compare. 

In [None]:
# Load the dataset for this task and discard the fingerprints that are not
# used in this iteration, then apply the drug-likeness filter.
data = data_preprocessing(task).drop(columns=["MACCS_FP", "ATOMPAIR_FP"])
data = select_druglike_molecules(data)
# data = remove_small_molecules(data)

# turn string of fingerprints into single features (aligned on the molecule index)
morgan_bits = convert_strings_to_int_array(data["Morgan_FP"].values)
morgan_fingerprint_df = pd.DataFrame(morgan_bits, index=data.index)
data = data.merge(morgan_fingerprint_df, left_index=True, right_index=True)

data

### Remove missing values 
Since less than 1% of molecules have missing values we simply remove those molecules since using a default value doesn't make much sense for the shown descriptors.

In [None]:
data_nan = extract_null(data)

n_nan_rows, n_nan_cols = data_nan.shape
nan_row_pct = n_nan_rows / data.shape[0] * 100
# NOTE(review): the "- 3" presumably excludes non-descriptor columns of data_nan — confirm against extract_null.
print(
    f"There are {n_nan_rows} ({nan_row_pct:.2f}%) molecules and {n_nan_cols - 3} descriptors with missing values."
)
data_nan

In [None]:
# Remove every molecule that has at least one missing value.
# errors="ignore" keeps the cell idempotent: on a second run the labels in
# data_nan.index are already gone from `data`, which would otherwise raise KeyError.
data = data.drop(data_nan.index, errors="ignore")
data.shape

### Train-Validation-Test split


In [None]:
# split data in train, val, test
# The identifier columns and the raw fingerprint string are dropped before splitting.
non_feature_columns = ["Drug", "Drug_ID", "Morgan_FP"]
datasets = dataset_split(data.drop(columns=non_feature_columns))

In [None]:
# The descriptors include discrete and continuous data, distinguished by their dtype.
# feature_groups exposes the column names per group; the cells below read
# feature_groups.continuous and feature_groups.discrete.

feature_groups = get_feature_groups(datasets, morgan_fingerprint_df)

## Dataset Exploration

In [None]:
# Summary statistics for every column of the training split.
datasets["train"].describe()

In [None]:
# Compare the class balance of the target label across the three splits.
# The legend title is derived from `task` so the notebook can be reused for
# other datasets without editing this cell.
plot_counts(
    [datasets[split]["Y"] for split in ("train", "val", "test")],
    suptitle="Distribution of the target label within each set",
    titles=["train", "validation", "test"],
    legend_title=f"{task.upper()} inhibition",
    kind="pie",
)

### Continuous Data

In [None]:
# Show a small sample (four columns) of the continuous descriptors, split by target label.
continuous_features = feature_groups.continuous
feature_distributions(
    data=datasets["train"][["Y"] + continuous_features],
    features=continuous_features[10:14],
    suptitle="Feature distributions given the target label using a KDE",
    task=f"{task} inhibition",
)

In [None]:
# Correlation matrix of descriptors
cor_matrix = datasets["train"][feature_groups.continuous].corr()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# feature pair appears once. The builtin `bool` replaces the `np.bool` alias,
# which was deprecated in NumPy 1.20 and removed in 1.24.
upper_triangle_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
top_cor_matrix = cor_matrix.where(upper_triangle_mask)

fig = px.imshow(
    top_cor_matrix,
    color_continuous_scale="RdBu_r",
    # plotly renders line breaks in titles via <br>; a "\n" is not shown as a new line
    title=f"{task} inhibition<br>Descriptor correlation",
)

fig.write_html(f"data/{task.lower()}/descriptor_correlation.html")


### Discrete Data

In [None]:
# Show a small sample (four columns) of the discrete descriptors, split by target label.
# The task label is derived from `task` instead of being hard-coded.
feature_distributions(
    data=datasets["train"][["Y"] + feature_groups.discrete],
    features=feature_groups.discrete[5:9],
    kind="hist",
    suptitle="Feature Distributions given the target label",
    task=f"{task.upper()} inhibition",
)


## Feature Selection

There are in total 208 different descriptors. Relevant descriptors for the task of predicting CYP inhibition need to be selected to reduce the number of input variables to the classical machine learning algorithm. Feature selection can be performed either unsupervised (without knowledge of the target label) or supervised.

**Note:** Some machine learning models have some form of feature selection built in, e.g. tree-based models. In those cases we don't perform feature selection upfront.

### Variance Threshold

Consider, for example, the number of radical electrons (NumRadicalElectrons): all datapoints in the dataset have a value of 0 (min=max=0.0). 

In the area of feature selection there is a method called **variance threshold**: Given a threshold all features with a variance below this threshold will be removed. (<span style="color:cyan">TODO</span> Add better source; https://medium.com/nerd-for-tech/removing-constant-variables-feature-selection-463e2d6a30d9#:~:text=Variance%20Threshold%20is%20a%20feature,be%20used%20for%20unsupervised%20learning.)

The default value is usually 0 (removing constant features, as they obviously bring no additional information to our model). If the variance threshold is greater than zero but still small, we are removing quasi-constant features. The argument against using a threshold greater than 0 is that you may be removing variables that, although they have low variance, might actually be extremely powerful in explaining your target (dependent) variable.

For now, we are exploring which features are constant in our dataset.

In [None]:
# Print the descriptor summary for NumRadicalElectrons, then show its
# statistics on the training split (min == max indicates a constant feature).
print(summarize_descriptors(["NumRadicalElectrons"]))
datasets["train"]["NumRadicalElectrons"].describe()

In [None]:
# A column with exactly one distinct value in the training split is constant.
# nunique() is computed once (the frame is wide) and filtered with a boolean mask.
train_unique_counts = datasets["train"].nunique(axis=0)

print("Features with 0 variance:\n")
for feature_name in train_unique_counts.index[train_unique_counts == 1]:
    print(feature_name)


### Drop Correlated Features

As shown in the correlation matrix, there are some feature groups in our dataset with high correlation. In order to escape the curse of dimensionality we want to remove features with a high correlation to other features - out of two features with high correlation only one remains. When features are collinear, permuting one feature will have little effect on the model's performance because it can get the same information from a correlated feature. One way to handle multicollinear features is by performing hierarchical clustering on the Spearman rank-order correlations, picking a threshold, and keeping a single feature from each cluster. Source: https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html

The y-axis of the following dendrogram is a measure of closeness of either individual data points or clusters. 


The idea of dropping highly correlated features is also applied by the following method: [DropCorrelatedFeatures](https://feature-engine.readthedocs.io/en/1.1.x/selection/DropCorrelatedFeatures.html) from feature_engine. Here, features are removed on a first-found, first-removed basis using the Pearson correlation score, without any further insight.

In [None]:
# Dendrogram built from the descriptor correlation matrix (cor_matrix) computed above.
# NOTE(review): cor_matrix comes from DataFrame.corr() with its default method,
# while the text above mentions Spearman correlations — confirm which is intended.
plot_dendrogram(cor_matrix, level=7, color_threshold=2)

In [None]:
# Exploring DropCorrelatedFeatures

continuous_train = datasets["train"][feature_groups.continuous]

drop_corr_features = DropCorrelatedFeatures(threshold=0.8)
print("Number of features before transformation:", continuous_train.shape[1])
reduced_continuous_data = drop_corr_features.fit_transform(continuous_train)
print("Number of features after transformation:", reduced_continuous_data.shape[1])

# Correlation matrix of descriptors
reduced_cor_matrix = reduced_continuous_data.corr()

# Strict upper triangle only (k=1 excludes the diagonal). The builtin `bool`
# replaces the `np.bool` alias, which was removed in NumPy 1.24.
reduced_upper_mask = np.triu(np.ones(reduced_cor_matrix.shape, dtype=bool), k=1)
reduced_top_cor_matrix = reduced_cor_matrix.where(reduced_upper_mask)

fig = px.imshow(
    reduced_top_cor_matrix,
    color_continuous_scale="RdBu_r",
    # plotly renders line breaks in titles via <br>; a "\n" is not shown as a new line
    title=f"{task} inhibition<br>Descriptor correlation after dropping highly correlated features",
)

fig.write_html(f"data/{task.lower()}/descriptor_correlation_pruned.html")

### Select Percentile

For discrete features and fingerprints we are using a mutual information statistical test and apply multivariate feature selection.

## Dimensionality reduction

### PCA

For continuous data we will perform a PCA to reduce the dimensionality of the features. Since PCA should only be applied to continuous data we will split our preprocessing pipeline into three parts:

1. Preprocessing of continuous descriptors
2. Preprocessing of discrete descriptors
3. Preprocessing of the fingerprint

See DataPreprocessing in utils/training.py for the exact preprocessing pipelines.

## Feature Normalization

For continuous features we are using StandardScaler as this is assumed by PCA. For discrete descriptors we are using a MinMaxScaler. Since fingerprint features are binary we don't normalize them.

## Training

### Dummy Classifier

In [None]:
# Baseline for comparison with the models trained below (see the heading: Dummy Classifier).
get_baseline(datasets)

### SVC

To keep track of old Bayesian Optimization runs and their differences:

#### SVC_bayesian_0


In [None]:
# Search space for run "svc_bayesian_0": only the SVC regularisation strength C is tuned.
svc_bayesian_optimizer = BayesianOptimization(
    model=SVC,
    file_name=f"{task}/svc_bayesian_0",
    model_params=[Real(name="C", low=0.1, high=4.0)],
    datasets=datasets,
    feature_groups=feature_groups,
)

# Uncomment to re-run the optimization instead of loading the stored results:
# svc_results = svc_bayesian_optimizer.optimize()

# LOAD results from bayesian optimization
svc_results = pd.read_csv(f"optimization/{task}/svc_bayesian_0").drop(
    columns="Unnamed: 0"
)

svc_results.sort_values("val_accuracy")

In [None]:
# Confusion matrix for the svc_bayesian_0 results. Run this directly after the
# cell above so that the optimizer and the results frame belong to the same run.
svc_bayesian_optimizer.best_confusion_matrix(svc_results)

#### svc_random_0

We can do random search by setting n_calls = n_initial_points.

In [None]:
# Dedicated name for this run: reusing `svc_bayesian_optimizer` would overwrite the
# svc_bayesian_0 optimizer, and re-running that run's confusion-matrix cell would
# then silently operate on the wrong configuration.
svc_random_0_optimizer = BayesianOptimization(
    model=SVC,
    file_name=f"{task}/svc_random_0",
    model_params=[
        Real(name="C", low=0.1, high=4.0),
    ],
    datasets=datasets,
    feature_groups=feature_groups,
)

# Random search: n_calls == n_initial_points, so every evaluation is an initial point.
# results = svc_random_0_optimizer.optimize(n_calls=50, n_initial_points=50)

# LOAD results from bayesian optimization
# results = load(f"optimization/{task}/svc_random_0")

# svc_random_0_optimizer.best_confusion_matrix(results)

#### svc_random_1_poly

In [None]:
# Run "svc_random_1_poly": polynomial-kernel SVC. Besides C and the polynomial degree,
# the preprocessing thresholds listed in preprocessing_params are part of the search space.
svc_random_1_poly_optimizer = BayesianOptimization(
    model=SVC,
    file_name=f"{task}/svc_random_1_poly", 
    model_params=[
        Real(name="C", low=0.1, high=4.0),
        # single-category Categorical = fixed (non-searched) parameter
        Categorical(name="kernel", categories=["poly"]),
        Integer(name="degree", low=3, high=12)
    ],
    datasets=datasets,
    feature_groups=feature_groups,
    preprocessing_params=[
                Categorical(name="var_threshold_continuous", categories=[0.0]),
                Real(name="var_threshold_discrete", low=0.02, high=0.05),
                Categorical(name="var_threshold_fingerprint", categories=[0.0]),
                Real(name="corr_threshold", low=0.7, high=0.95),
            ]
)

# Uncomment to re-run (random search: n_calls == n_initial_points).
# Note: this must use the optimizer defined in this cell, not svc_bayesian_optimizer.
# svc_random_1_poly = svc_random_1_poly_optimizer.optimize(n_calls=50, n_initial_points=50) 

# LOAD results from bayesian optimization
svc_random_1_poly = pd.read_csv(f"optimization/{task}/svc_random_1_poly").drop("Unnamed: 0", axis=1)

svc_random_1_poly.sort_values("val_accuracy")

In [None]:
# Confusion matrix for the svc_random_1_poly results loaded above.
# takes about 2 minutes
svc_random_1_poly_optimizer.best_confusion_matrix(svc_random_1_poly) 

### Weighted SVC
#### svc_weighted_bayesian_0

In [None]:
# Run "svc_weighted_bayesian_0": same search over C as svc_bayesian_0, but with class weighting.
svc_weighted_bayesian_0_bayesian_optimizer = BayesianOptimization(
    model=SVC,
    file_name=f"{task}/svc_weighted_bayesian_0",
    model_params=[
        Real(name="C", low=0.1, high=4.0),
        # Fixed (single-category) parameter that makes this the *weighted* variant;
        # without it the search space is identical to svc_bayesian_0.
        # NOTE(review): assumes model_params are forwarded to SVC as keyword
        # arguments, as the fixed "kernel" parameter of svc_random_1_poly suggests — confirm.
        Categorical(name="class_weight", categories=["balanced"]),
    ],
    datasets=datasets,
    feature_groups=feature_groups,
)

# Uncomment to (re-)run the optimization; this must use the optimizer defined in
# this cell, not svc_bayesian_optimizer.
# svc_weighted_bayesian_0 = svc_weighted_bayesian_0_bayesian_optimizer.optimize()

# LOAD results from bayesian optimization.
# Previously both assignments were commented out, so the last line raised a
# NameError on a fresh kernel. If the results file does not exist yet, run the
# optimize() line above once.
svc_weighted_bayesian_0 = pd.read_csv(f"optimization/{task}/svc_weighted_bayesian_0").drop("Unnamed: 0", axis=1)

svc_weighted_bayesian_0.sort_values("val_accuracy")

### RandomForestClassifier

For a random forest classifier we don't need to do any preprocessing. A decision-tree-based classifier is scale invariant and has built-in feature selection.

#### max_depth

turn following cell into python to re-run / edit

In [None]:
def train_random_forest_depth(datasets, file_path, max_depths=None, random_state=None):
    """Sweep ``max_depth`` of a RandomForestClassifier and cache the validation accuracy.

    If ``file_path`` already exists it is treated as a finished result cache and
    loaded without any training (``datasets`` is not touched in that case).

    Parameters
    ----------
    datasets : mapping
        Must provide ``"train"`` and ``"val"`` DataFrames, each with a target
        column ``"Y"``; every other column is used as a feature.
    file_path : str or path-like
        CSV file used as the result cache.
    max_depths : iterable of int, optional
        Depths to evaluate. Defaults to ``range(3, 70)``.
    random_state : int, optional
        Forwarded to ``RandomForestClassifier``. The default ``None`` keeps the
        forests unseeded; pass an integer for reproducible sweeps.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated depth with columns ``max_depth`` and ``val_accuracy``.
    """
    results_path = Path(file_path)
    if results_path.exists():
        return pd.read_csv(results_path).drop("Unnamed: 0", axis=1)

    max_depths = list(range(3, 70) if max_depths is None else max_depths)

    # Create the target directory up front so a missing directory fails (or is
    # fixed) before the long-running sweep rather than after it.
    results_path.parent.mkdir(parents=True, exist_ok=True)

    x_train = np.array(datasets["train"].drop("Y", axis=1))
    y_train = np.array(datasets["train"]["Y"])
    x_val = np.array(datasets["val"].drop("Y", axis=1))
    y_val = np.array(datasets["val"]["Y"])

    records = []
    for run, max_depth in enumerate(max_depths, start=1):
        rf = RandomForestClassifier(
            max_depth=max_depth, n_jobs=-1, random_state=random_state
        )
        rf.fit(x_train, y_train)
        acc = accuracy_score(y_val, rf.predict(x_val))
        print(
            f"Completed run {run}/{len(max_depths)}: max_depth={max_depth}, accuracy={acc}"
        )
        records.append({"max_depth": max_depth, "val_accuracy": acc})

    # The cache is written only after the whole sweep has finished. Streaming rows
    # into the file would leave a partial file behind after an interruption, and the
    # existence check above would later accept it as a complete result.
    rf_results = pd.DataFrame(records, columns=["max_depth", "val_accuracy"])
    rf_results.to_csv(results_path)  # index column is read back as "Unnamed: 0"
    return rf_results


# Run the max_depth sweep, or load it if the result file already exists.
rf_max_depth_path = f"optimization/{task}/rf_max_depth"
rf_max_depth = train_random_forest_depth(datasets=datasets, file_path=rf_max_depth_path)

# Best validation accuracy first.
rf_max_depth.sort_values("val_accuracy", ascending=False)

In [None]:
# Validation accuracy as a function of max_depth, using the sweep results in rf_max_depth.
plot_parameter_metric(
    metric_values=rf_max_depth["val_accuracy"],
    model_name="RandomForestClassifier",
    metric="validation accuracy",
    parameter="max_depth",
    param_values=rf_max_depth["max_depth"],
)

In [None]:
x_train = np.array(datasets["train"].drop("Y", axis=1))
y_train = np.array(datasets["train"]["Y"])
x_val = np.array(datasets["val"].drop("Y", axis=1))
y_val = np.array(datasets["val"]["Y"])

# Single source of truth for the chosen depth, so the model and the plot title cannot drift apart.
# NOTE(review): 38 was presumably picked from the max_depth sweep above — confirm.
best_max_depth = 38

best_rf = RandomForestClassifier(max_depth=best_max_depth, n_jobs=-1)
best_rf.fit(x_train, y_train)
y_pred = best_rf.predict(x_val)
plot_confusion_matrix(
    y_val, y_pred, f"RandomForestClassifier(max_depth={best_max_depth})"
)

### Logistic Regression

#### lr_bayesian_0

In [None]:
# Run "lr_bayesian_0": logistic regression with the saga solver (supports both
# l1 and l2 penalties); penalty type and C are searched, solver and n_jobs are fixed.
# LogisticRegression is imported in the import cell at the top of the notebook.
lr_bayesian_optimizer = BayesianOptimization(
    model=LogisticRegression,
    file_name=f"{task}/lr_bayesian_0",
    model_params=[
        Categorical(name="penalty", categories=["l1", "l2"]),
        Real(name="C", low=0.1, high=4.0),
        Categorical(name="solver", categories=["saga"]),
        Categorical(name="n_jobs", categories=[-1]),
    ],
    datasets=datasets,
    feature_groups=feature_groups,
)

# Uncomment to re-run the optimization instead of loading the stored results:
# lr_results = lr_bayesian_optimizer.optimize()

# LOAD results from bayesian optimization.
# The CSV index column is dropped on load, as in the SVC cells, so that
# lr_results no longer carries a stray "Unnamed: 0" column when it is passed on
# to best_confusion_matrix.
lr_results = pd.read_csv(f"optimization/{task}/lr_bayesian_0").drop("Unnamed: 0", axis=1)

lr_results.sort_values("val_accuracy")

In [None]:
# Confusion matrix for the lr_bayesian_0 results loaded above.
lr_bayesian_optimizer.best_confusion_matrix(lr_results)

# Next Steps

- ~~Use Dendrogram [only continuous data] for feature selection (Jonna)~~
- Feature selection method for discrete data (James)
- ~~Baseline~~
  - ~~DummyClassifier~~
- Classical models
  - ~~Random Forest (little feature selection) (Jonna)~~
  - KNN + Bayesian Optimization (James)
  - XGBoost + Bayesian Optimization (James)
  - ~~SVC (Jonna)~~
  - Linear Models (Jonna)
  - Naive Bayes + Bayesian Optimization (James)
  - [later] simple NN
- Try giving weights to classes (will solve unbalanced data sets)
- Get report working (James)
- ~~Bayesian Optimization (Jonna)~~
- apply the notebook to the other datasets