<a target="_blank" href="https://colab.research.google.com/github/compomics/ML-course-VIB-2024/blob/master/notebooks/Histone_marks_dt.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Histone modifications

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Single seed shared by the notebook: it seeds NumPy's global random state here
# and is passed again as `random_state` to the split and to several estimators
# further down, so that results are repeatable on a fresh top-to-bottom run.
random_seed = 42
np.random.seed(random_seed)
!pip install pydotplus

# 1. Reading the data

In [None]:
# Download the training and test tables as DataFrames directly from GitHub
# (needs network access).
# NOTE(review): these URLs point at the ML-course-VIB-2020 repository while the
# Colab badge of this notebook points at ML-course-VIB-2024 - confirm that this
# is the intended copy of the data.
train = pd.read_csv("https://raw.githubusercontent.com/sdgroeve/ML-course-VIB-2020/master/data/data_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/sdgroeve/ML-course-VIB-2020/master/data/data_test.csv")

In [None]:
# DataFrame.pop removes the column from `train` in place and returns it, so
# after this cell `train` no longer holds the "GeneId" and "Label" columns.
# Re-running the cell without reloading the data raises a KeyError.
train_ids = train.pop("GeneId")
train_labels = train.pop("Label")

In [None]:
# Same for the test table: take the "GeneId" column out of `test` (in place)
# and keep it aside in its own variable.
test_index_col = test.pop("GeneId")

# 2. Fitting a decision tree model

The scikit-learn `DecisionTreeClassifier` class computes a decision tree predictive model from a dataset. 

To get all the options for learning you can simply type: 

In [None]:
# Print the built-in documentation of the class, including its hyperparameters.
help(DecisionTreeClassifier)

You will notice that there are many hyperparameters to set. Some of these have a large impact on the complexity of the fitted model. One such important hyperparameter is `max_depth`, which sets a limit on how deep a decision tree can become.

Let's create a decision tree model with `max_depth=3`:

In [None]:
# Unfitted decision tree limited to a depth of 3.
# NOTE(review): unlike the tree used for the grid search further down, no
# `random_state` is passed here - confirm whether relying on the global NumPy
# seed is intended.
cls = DecisionTreeClassifier(max_depth=3)

This creates a decision tree model with default values for the other hyperparameters:

In [None]:
# A bare expression on the last line of a notebook cell is shown as the cell's
# output; here that is the estimator's representation.
cls

Let's create a validation set, fit the model and evaluate.

In [None]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data as a validation set. The fixed
# random_state makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(train, train_labels,
                                                  test_size=.2, random_state=random_seed)

# Fit the depth-limited tree on the training part only.
cls.fit(train_X, train_y)

# Hard class predictions, used for the accuracy.
predictions_train = cls.predict(train_X)
predictions_val = cls.predict(val_X)

# scikit-learn metrics take (y_true, y_pred), in that order.
# Output format: "(score on the training part) score on the validation part".
print("Accuracy: (%f) %f"%(accuracy_score(train_y, predictions_train),accuracy_score(val_y, predictions_val)))

# Class-probability estimates: one column per class, ordered as cls.classes_.
predictions_train_prob = cls.predict_proba(train_X)
predictions_val_prob = cls.predict_proba(val_X)

# Column 1 is the probability of the second class in cls.classes_ (the positive
# class of this two-class problem); the same column is used by the other
# evaluation cells of this notebook.
print("Log-loss: (%f) %f"%(log_loss(train_y,predictions_train_prob[:,1]),log_loss(val_y,predictions_val_prob[:,1])))

The following code plots the fitted decision tree `cls` as a `tree.png` file:

In [None]:
from sklearn import tree
from io import StringIO
from IPython.display import Image, display
import pydotplus

# Export the fitted tree `cls` as Graphviz DOT text into an in-memory buffer.
# Passing the column names labels each split with the feature it tests instead
# of an anonymous x[i].
out = StringIO()
tree.export_graphviz(cls, out_file=out, feature_names=list(train_X.columns))

# Turn the DOT text into a graph object and render it to a PNG file.
# NOTE(review): rendering presumably needs the Graphviz binaries on the machine
# in addition to the pydotplus package - confirm for non-Colab environments.
graph=pydotplus.graph_from_dot_data(out.getvalue())
graph.write_png("tree.png")

# Actually show the rendered tree in the notebook (previously the file was
# written but never displayed, leaving the Image/display imports unused).
display(Image(filename="tree.png"))

How do other values for the `max_depth` hyperparameter perform?

In [None]:
# Candidate tree depths to compare: 1 up to and including 9
param_grid = {'max_depth': range(1, 10)}

# Base estimator whose max_depth is varied by the search
clf = DecisionTreeClassifier(random_state=random_seed)

# 5-fold cross-validated search. scikit-learn maximises scores, so the
# log-loss is used in its negated form; training scores are kept as well so
# that both curves can be drawn below.
grid_search = GridSearchCV(
    clf,
    param_grid,
    scoring='neg_log_loss',
    cv=5,
    verbose=1,
    return_train_score=True,
)
grid_search.fit(train_X, train_y)

# One row per evaluated max_depth
results = pd.DataFrame(grid_search.cv_results_)

# Flip the sign of the mean scores to get ordinary (positive) log-loss values
plot_data = pd.DataFrame({
    'max_depth': results['param_max_depth'],
    'log_loss_train': results['mean_train_score'].mul(-1),
    'log_loss_val': results['mean_test_score'].mul(-1),
})

# Long format: one row per (max_depth, set) pair, which is what seaborn's
# `hue` expects
plot_data_melted = pd.melt(
    plot_data,
    id_vars='max_depth',
    value_vars=['log_loss_train', 'log_loss_val'],
    var_name='set',
    value_name='log-loss',
)

# Training and cross-validation log-loss as a function of the depth limit
sns.lineplot(data=plot_data_melted, x='max_depth', y='log-loss', hue='set')
plt.xlabel('Max Depth')
plt.ylabel('Log-Loss')
plt.title('Log-Loss vs Max Depth')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Depending on which samples are selected for training, we can get different models, and therefore different predictions and scores:

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib.pyplot as plt

# Refit one deep decision tree on 20 different random 50/50 splits of the
# labelled data, to show how much the predictions depend on which samples end
# up in the training half.

# Initialize the model
cls = DecisionTreeClassifier(max_depth=9)

# Initialize DataFrame to store predictions and true labels
predictions_df = pd.DataFrame()

# List to store per-iteration metrics
iteration_metrics = []

# Perform multiple train-test splits
for i in range(20):
    # Loop-local names are used for the split on purpose: rebinding
    # train_X / val_X / train_y / val_y here would replace the notebook-wide
    # hold-out split with the last 50/50 split, and the cells further down
    # would silently be evaluated on different data.
    split_train_X, split_val_X, split_train_y, split_val_y = train_test_split(
        train, train_labels, test_size=0.5, random_state=i
    )

    cls.fit(split_train_X, split_train_y)
    predictions_val = cls.predict(split_val_X)
    predictions_val_prob = cls.predict_proba(split_val_X)[:, -1]  # Probabilities for the positive class

    # Calculate log-loss and accuracy
    iteration_log_loss = log_loss(split_val_y, predictions_val_prob)
    iteration_accuracy = accuracy_score(split_val_y, predictions_val)
    iteration_metrics.append((i, iteration_log_loss, iteration_accuracy))

    print(f"Seed: {i} | Log-Loss: {iteration_log_loss:.6f} | Accuracy: {iteration_accuracy:.6f}")

    # Store the predictions and true labels in a DataFrame, keyed by the
    # original row index so that runs can be aligned per instance
    fold_predictions = pd.DataFrame({
        'Instance': split_val_X.index,
        f'Seed_{i}': predictions_val_prob,
        'True_Label': split_val_y.values
    })

    # Merge predictions from each seed; the outer join keeps instances that
    # were in the validation half of only some of the splits (NaN elsewhere)
    if predictions_df.empty:
        predictions_df = fold_predictions
    else:
        predictions_df = pd.merge(predictions_df, fold_predictions, on=['Instance', 'True_Label'], how='outer')

# Calculate the average prediction for each instance; missing values are
# skipped, so the mean is taken over the seeds in which the instance was
# actually predicted
predictions_df['Avg_Prediction'] = predictions_df.filter(like='Seed_').mean(axis=1)

# Calculate the log-loss based on the averaged predictions
average_log_loss = log_loss(predictions_df['True_Label'], predictions_df['Avg_Prediction'])
print(f"\nAverage Model Log-Loss Across All Seeds: {average_log_loss:.6f}")

# Convert iteration metrics to a DataFrame for further analysis if needed
metrics_df = pd.DataFrame(iteration_metrics, columns=['Seed', 'Log_Loss', 'Accuracy'])

# Plot the pairplot: one panel per pair of seeds (20 x 20 panels, so drawing
# takes a while)
sns.pairplot(predictions_df.filter(like='Seed_'))

# Adjust the layout explicitly
plt.tight_layout()
plt.suptitle("Pairwise Comparison of Predicted Probabilities Across Seeds", y=1.02)
plt.show()

# 5. Ensemble learning: bagging

We have seen that bias and variance play an important role in Machine Learning. 

Let's first see what bagging can do for our dataset. 

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of depth-10 decision trees; random_state makes the
# resampling repeatable.
cls = BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=10),random_state=random_seed)

cls.fit(train_X,train_y)

# Hard class predictions, used for the accuracy.
predictions_train = cls.predict(train_X)
predictions_val = cls.predict(val_X)

# scikit-learn metrics take (y_true, y_pred), in that order.
# Output format: "(score on the training part) score on the validation part".
print("Accuracy: (%f) %f"%(accuracy_score(train_y, predictions_train),accuracy_score(val_y, predictions_val)))

# Class-probability estimates: one column per class, ordered as cls.classes_.
predictions_train_prob = cls.predict_proba(train_X)
predictions_val_prob = cls.predict_proba(val_X)

# Column 1 is the probability of the second class in cls.classes_.
print("Log-loss: (%f) %f"%(log_loss(train_y,predictions_train_prob[:,1]),log_loss(val_y,predictions_val_prob[:,1])))

With the `RandomForestClassifier`, the variance of the decision tree is also reduced by selecting features at random during decision tree construction. Let's see how far we get with default hyperparameter values.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters; only the seed is fixed so that
# the result is repeatable.
cls = RandomForestClassifier(random_state=random_seed)

cls.fit(train_X,train_y)

# Hard class predictions, used for the accuracy.
predictions_train = cls.predict(train_X)
predictions_val = cls.predict(val_X)

# scikit-learn metrics take (y_true, y_pred), in that order.
# Output format: "(score on the training part) score on the validation part".
print("Accuracy: (%f) %f"%(accuracy_score(train_y, predictions_train),accuracy_score(val_y, predictions_val)))

# Class-probability estimates: one column per class, ordered as cls.classes_.
predictions_train_prob = cls.predict_proba(train_X)
predictions_val_prob = cls.predict_proba(val_X)

# Column 1 is the probability of the second class in cls.classes_.
print("Log-loss: (%f) %f"%(log_loss(train_y,predictions_train_prob[:,1]),log_loss(val_y,predictions_val_prob[:,1])))

Now let's do hyperparameter tuning:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss

# Hyperparameter values to combine exhaustively (2 * 3 * 3 * 3 = 54 settings)
param_grid = {
    'n_estimators': [100, 200],       # trees in the forest
    'max_depth': [None, 10, 20],      # depth limit per tree (None = no limit)
    'min_samples_split': [2, 5, 10],  # samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4]     # samples needed in a leaf node
}

# Forest whose hyperparameters are tuned
cls = RandomForestClassifier(random_state=random_seed)

# 5-fold cross-validated search scored by negative log-loss, spread over all
# available CPU cores
grid_search = GridSearchCV(
    cls,
    param_grid,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(train_X, train_y)

# Winning setting and the model refitted with it on the full training part
print(f"Best Hyperparameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Hard predictions and their accuracy, training part first, then validation
predictions_train, predictions_val = best_model.predict(train_X), best_model.predict(val_X)
accuracy_train, accuracy_val = (
    accuracy_score(train_y, predictions_train),
    accuracy_score(val_y, predictions_val),
)
print("Accuracy: (%f) %f" % (accuracy_train, accuracy_val))

# Probability of the second class (column 1) and the resulting log-loss
predictions_train_prob = best_model.predict_proba(train_X)[:, 1]
log_loss_train = log_loss(train_y, predictions_train_prob)
predictions_val_prob = best_model.predict_proba(val_X)[:, 1]
log_loss_val = log_loss(val_y, predictions_val_prob)
print("Log-loss: (%f) %f" % (log_loss_train, log_loss_val))
