In [None]:
import pandas as pd
import plotly.express as px

from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer

from utils.data_preparation import *
from utils.data_exploration import *

from rdkit import Chem

from feature_engine.selection import DropCorrelatedFeatures


# Name of the prediction task. It is passed to data_preprocessing() to load the
# data and reused for plot labels and for the output directory of saved figures.
task = 'CYP2C19'

## Loading the Dataset + Data Cleaning

In terms of data cleaning the following steps are performed:

- Normalization of SMILES strings before calculating descriptors and fingerprints
  - Normalization includes the removal of metals in the molecule (<span style="color:cyan">TODO</span> Why?)
- Removal of small molecules
  - For example: 
    - Molecules consisting of a single atom (<span style="color:cyan">TODO</span> Why?)
    - Molecules that are metals
- *Molecular descriptors:* Removal of NaN values by dropping either the corresponding column or the corresponding row.
  - For molecular descriptors it doesn't make much sense to fill missing values with some default value or the mean of the existing values.


In the first iteration we will focus on using the Morgan fingerprints. If there is time later we will explore other fingerprints and compare. 

In [None]:
# Load the preprocessed dataset for the selected task.
data = data_preprocessing(task)
# Removal of small molecules is currently disabled:
# data = remove_small_molecules(data)

# Only the Morgan fingerprint is used here; the other fingerprint columns are dropped.
data = data.drop(columns=["MACCS_FP", "ATOMPAIR_FP"])

# Turn the fingerprint strings into individual feature columns, aligned with
# the rows of `data` through the shared index.
morgan_values = convert_strings_to_int_array(data["Morgan_FP"].values)
morgan_fingerprint_df = pd.DataFrame(morgan_values, index=data.index)
data = data.merge(morgan_fingerprint_df, left_index=True, right_index=True)
data

### Remove missing values 

In [None]:
# Extract the entries of `data` with missing values via the project helper
# `extract_null` and display the result.
# NOTE(review): presumably rows are the affected molecules and the columns after
# the first three are the affected descriptors - confirm against the helper.
data_nan = extract_null(data)
data_nan

In [None]:
# Report how many molecules (rows) and descriptors (columns) are affected by
# missing values. The first three columns of `data_nan` are not counted as
# descriptors, matching the `columns[3:]` slice below.
n_missing_molecules, n_nan_columns = data_nan.shape
missing_share = n_missing_molecules / data.shape[0] * 100
n_missing_descriptors = n_nan_columns - 3
print(
    f"There are {n_missing_molecules} ({missing_share:.2f}%) molecules and {n_missing_descriptors} descriptors with missing values."
)
summarize_descriptors(data_nan.columns[3:])

Since only 0.28% of the molecules have missing values, we simply remove them: filling in a default value doesn't make much sense for the descriptors shown above.

In [None]:
# Drop every row whose index label appears in `data_nan`, then show the
# remaining (rows, columns) shape.
data = data.drop(data_nan.index)
data.shape

In [None]:
# Drop the "Drug" and "Drug_ID" columns, then split the remaining table into
# the train, val and test sets.
model_table = data.drop(columns=["Drug", "Drug_ID"])
datasets = dataset_split(model_table)
datasets["train"]

## Dataset Exploration

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the training set.
datasets["train"].describe()

In [None]:
# Compare the distribution of the target label "Y" across the three splits.
# The legend title is derived from `task` (instead of a hard-coded name) so it
# stays consistent with the other plots when the task changes.
split_names = ["train", "val", "test"]
plot_counts(
    [datasets[split_name]["Y"] for split_name in split_names],
    suptitle="Distribution of the target label within each set",
    titles=["train", "validation", "test"],
    legend_title=f"{task} inhibition",
    kind="pie",
)

In [None]:
# The descriptors include discrete and continuous data, distinguished by their dtype.
train_frame = datasets["train"]

unique_dtypes = set(train_frame.dtypes)
print(f"Datatypes: {unique_dtypes}")

# float64 columns are treated as continuous descriptors.
continuous_descriptors = list(train_frame.select_dtypes(include="float64").columns)

# The fingerprint columns are the columns of the expanded Morgan fingerprint frame.
fingerprint_features = list(morgan_fingerprint_df.columns)

# int64 columns that are not fingerprint columns are treated as discrete
# descriptors. Filtering against a set keeps the original column order, avoids
# the quadratic cost of repeated list.remove() calls, and does not fail when a
# fingerprint column is missing from the int64 selection.
fingerprint_feature_set = set(fingerprint_features)
discrete_descriptors = [
    column
    for column in train_frame.select_dtypes(include="int64").columns
    if column not in fingerprint_feature_set
]

### Continuous Data

In [None]:
# Plot the distributions of a subset of the continuous descriptors (list
# positions 10 to 17). The frame passed in holds the target column "Y" plus
# all continuous descriptors.
kde_features = continuous_descriptors[10:18]
kde_columns = ["Y"] + continuous_descriptors
feature_distributions(
    data=datasets["train"][kde_columns],
    features=kde_features,
    suptitle="Feature distributions given the target label using a KDE",
    task=f"{task} inhibition",
)

In [None]:
# Correlation matrix of the continuous descriptors (pandas default method).
cor_matrix = datasets["train"][continuous_descriptors].corr()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# descriptor pair is shown once; all other cells become NaN.
# The builtin `bool` replaces `np.bool`, which was only an alias for it and was
# removed in NumPy 1.24 (AttributeError on current NumPy versions).
# NOTE(review): `np` is not imported in this notebook; it is presumably provided
# by one of the `utils` star imports - confirm.
upper_triangle_mask = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
top_cor_matrix = cor_matrix.where(upper_triangle_mask)
fig = px.imshow(top_cor_matrix, color_continuous_scale='RdBu_r', title=f"{task} inhibition\nDescriptor correlation")

# Save the interactive heatmap in the task-specific data directory.
fig.write_html(f"data/{task.lower()}/descriptor_correlation.html")

### Discrete Data

In [None]:
# Plot histograms for a subset of the discrete descriptors (list positions 5
# to 10). The task label is derived from `task`, as in the continuous plot,
# instead of being hard-coded.
histogram_features = discrete_descriptors[5:11]
feature_distributions(
    data=datasets["train"][discrete_descriptors],
    features=histogram_features,
    kind="hist",
    suptitle="Feature Distributions given the target label",
    task=f"{task} inhibition",
)

## Feature Selection

There are 208 different descriptors in total. Relevant descriptors for the task of predicting CYP inhibition need to be selected to reduce the number of input variables to the classical machine learning algorithms. Feature selection can be performed either unsupervised (without knowledge of the target label) or supervised.

**Note:** Some machine learning models have some form of feature selection built in, e.g. tree-based models. In those cases we don't perform feature selection upfront.

Take, for example, the number of radical electrons (NumRadicalElectrons) (<span style="color:cyan">TODO</span> Add description of NumRadicalElectrons): all datapoints in the dataset have a value of 0 (min=max=0.0).

In [None]:
# Show the description of the descriptor, followed by its summary statistics
# on the training set.
print(summarize_descriptors(["NumRadicalElectrons"]))
# Describe only the column of interest instead of computing statistics for
# every column of the training set and then discarding all but one.
datasets["train"]["NumRadicalElectrons"].describe()

In the area of feature selection there is a method called **variance threshold**: Given a threshold all features with a variance below this threshold will be removed. (<span style="color:cyan">TODO</span> Add better source; https://medium.com/nerd-for-tech/removing-constant-variables-feature-selection-463e2d6a30d9#:~:text=Variance%20Threshold%20is%20a%20feature,be%20used%20for%20unsupervised%20learning.)

The default value is usually 0 (removing constant features as they obviously bring no additional information to our model). If the variance threshold is greater than zero but still small, we are removing quasi-constant features. The argument against using a threshold greater than 0 is that you may be removing variables that, although they have low variance, might actually be extremely powerful in explaining your target (dependent) variable.

For now, we are exploring which features are constant in our dataset.

In [None]:
print("Features with 0 variance:\n")
# A column with exactly one distinct value is constant. The per-column counts
# are computed once and reused, instead of calling nunique() twice on the
# whole training set.
unique_value_counts = datasets["train"].nunique(axis=0)
for column_name, n_unique in unique_value_counts.items():
    if n_unique == 1:
        print(column_name)

In [None]:
# Exploratory pipeline with a single step: a VarianceThreshold with threshold
# 0.0, i.e. only features whose variance is zero are removed.
pipe = Pipeline(steps=[
    ("remove_zero_var", VarianceThreshold(threshold=0.0))])

# Candidate methods to consider: zero-variance removal, correlation between
# features, PCA, mutual information (discrete and continuous).

As shown in the correlation matrix, there are some feature groups in our dataset with high correlation. In order to escape the curse of dimensionality we want to remove features with a high correlation to other features: out of two features with high correlation only one remains. When features are collinear, permuting one feature will have little effect on the model's performance because it can get the same information from a correlated feature. One way to handle multicollinear features is by performing hierarchical clustering on the Spearman rank-order correlations, picking a threshold, and keeping a single feature from each cluster. Source: https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html

The y-axis of the following dendrogram is a measure of the closeness of either individual data points or clusters.

In [None]:
# Draw the dendrogram for the descriptor correlation matrix computed earlier.
# NOTE(review): `cor_matrix` comes from DataFrame.corr() with its default
# method (Pearson), not Spearman - confirm this is intended.
plot_dendrogram(cor_matrix, level=7, color_threshold=2)

The idea of dropping highly correlated features is also applied by the following method: [DropCorrelatedFeatures](https://feature-engine.readthedocs.io/en/1.1.x/selection/DropCorrelatedFeatures.html) from the feature_engine library. Here, features are removed on a first-found, first-removed basis using the Pearson correlation score, without any further insight.

In [None]:
# Exploring DropCorrelatedFeatures

continuous_train = datasets["train"][continuous_descriptors]

drop_corr_features = DropCorrelatedFeatures(threshold=0.8)
print("Number of features before transformation:", continuous_train.shape[1])
reduced_continuous_data = drop_corr_features.fit_transform(continuous_train)
print("Number of features after transformation:", reduced_continuous_data.shape[1])

# Correlation matrix of the remaining descriptors.
reduced_cor_matrix = reduced_continuous_data.corr()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# descriptor pair is shown once; all other cells become NaN.
# The builtin `bool` replaces `np.bool`, which was only an alias for it and was
# removed in NumPy 1.24 (AttributeError on current NumPy versions).
reduced_upper_mask = np.triu(np.ones(reduced_cor_matrix.shape), k=1).astype(bool)
reduced_top_cor_matrix = reduced_cor_matrix.where(reduced_upper_mask)
fig = px.imshow(
    reduced_top_cor_matrix,
    color_continuous_scale="RdBu_r",
    title=f"{task} inhibition\nDescriptor correlation after dropping highly correlated features",
)

# Save the interactive heatmap in the task-specific data directory.
fig.write_html(f"data/{task.lower()}/descriptor_correlation_pruned.html")

In [None]:
# Two-step selection pipeline: correlated continuous descriptors are dropped
# first, then zero-variance features are removed.
# The correlation filter needs to be first since it takes a DataFrame as an input.
correlated_feature_filter = DropCorrelatedFeatures(
    variables=continuous_descriptors,
    threshold=0.8,
)
zero_variance_filter = VarianceThreshold(threshold=0.0)

pipe = Pipeline(
    steps=[
        ("continuous-drop_corr_features", correlated_feature_filter),
        ("discrete-drop_zero_var", zero_variance_filter),
    ]
)

## Dimensionality reduction

For continuous data we will perform a PCA to reduce the dimensionality of the features. Since PCA should only be applied to continuous data we will split our preprocessing pipeline into three parts:

1. Preprocessing of continuous descriptors
2. Preprocessing of discrete descriptors
3. Preprocessing of the fingerprint

In [None]:
# Preprocessing of the continuous descriptors: drop correlated features, drop
# zero-variance features, standardize, then project with PCA.
continuous_steps = [
    # DropCorrelatedFeatures needs to be first since it takes a DataFrame as an input
    (
        "drop_corr_features",
        DropCorrelatedFeatures(variables=continuous_descriptors, threshold=0.8),
    ),
    ("drop_zero_var", VarianceThreshold(threshold=0.0)),
    # pca assumes mean=0 and variance=1
    ("normalize", StandardScaler()),
    ("pca", PCA(n_components=70)),
]
continuous_preprocessing = Pipeline(steps=continuous_steps)

# Preprocessing of the discrete descriptors: remove zero-variance features,
# then rescale with a MinMaxScaler.
discrete_steps = [
    ("drop_zero_var", VarianceThreshold(threshold=0.0)),
    ("min_max_normalization", MinMaxScaler()),
]
discrete_preprocessing = Pipeline(steps=discrete_steps)

# Preprocessing of the fingerprint features: same two steps, built from its
# own estimator instances so that the two pipelines are fitted independently.
fingerprint_steps = [
    ("drop_zero_var", VarianceThreshold(threshold=0.0)),
    ("min_max_normalization", MinMaxScaler()),
]
fingerprint_preprocessing = Pipeline(steps=fingerprint_steps)

In [None]:
# Fit each preprocessing pipeline on the training split only.
train_frame = datasets["train"]
train_target = train_frame["Y"]

# `discrete_descriptors` is built from all int64 columns of the training set,
# so it may contain the target column "Y". The target must not be part of the
# feature matrix (label leakage), so it is excluded here. This is a no-op when
# "Y" is not in the list.
discrete_feature_columns = [
    column for column in discrete_descriptors if column != "Y"
]

fingerprint_preprocessing.fit(train_frame[fingerprint_features], train_target)
discrete_preprocessing.fit(train_frame[discrete_feature_columns], train_target)
continuous_preprocessing.fit(train_frame[continuous_descriptors], train_target)

# Next Steps

- Use Dendrogram [only continuous data] for feature selection (Jonna)
- Feature selection method for discrete data (James)
- Baseline
  - DummyClassifier
- Classical models
  - Random Forest (little feature selection) (Jonna)
  - KNN (James)
  - XGBoost (James)
  - SVM (Jonna)
  - Linear Models (Jonna)
  - Naive Bayes (James)
  - [later] simple NN
- Try giving weights to classes (will solve unbalanced data sets) (James)