### Imports

In [None]:
from pprint import pprint

import joblib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import resample

Divisão do dataset em treino e em teste.

In [None]:
# Separate the feature matrix from the target column 'Y'.
# `dataset` is expected to have been created earlier in the notebook.
X = dataset.drop('Y', axis=1)
y = dataset['Y']

# train_test_split returns (X_train, X_test, y_train, y_test) in that order,
# so the unpacking targets must be features-train, features-test,
# labels-train, labels-test. 20% of the rows are held out for testing.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Modelos de ML
A nossa tarefa é prever o nível de sinergia entre duas drogas, o que constitui um problema de regressão; logo, só poderemos utilizar modelos de regressão.

## Logistic Regression

In [None]:
# NOTE(review): LogisticRegression is a classifier, while the notebook text
# describes the task as regression. This cell only works if 'Y' holds discrete
# class labels -- confirm how 'Y' is encoded.
logreg = LogisticRegression(random_state=45, max_iter=1000)
logreg.fit(train_X, train_y)

# For a classifier, .score() is the mean accuracy on the given data.
print(f'Accuracy: {logreg.score(test_X, test_y):.2%}')

pred_y = logreg.predict(test_X)
# classification_report returns a pre-formatted multi-line string: print() renders
# it as a table, whereas pprint() would show its repr with escaped newlines.
print(classification_report(test_y, pred_y))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
print('Confusion Matrix:')
confusion_matrix(y_true=test_y, y_pred=pred_y)

In [None]:
#define metrics
# predict_proba must receive the feature matrix (not the labels); column 1 is
# the predicted probability of the second class in logreg.classes_.
# NOTE(review): roc_curve expects a binary target -- confirm 'Y' is binary.
y_pred_proba = logreg.predict_proba(test_X)[:, 1]
fpr, tpr, _ = roc_curve(test_y, y_pred_proba)
auc = roc_auc_score(test_y, y_pred_proba)

#create ROC curve
plt.plot(fpr, tpr, label="AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

## Decision Tree

In [None]:
dtree = DecisionTreeRegressor(random_state=45)
dtree.fit(train_X, train_y)

# For a regressor, .score() is the coefficient of determination (R^2), not an
# accuracy, so it is reported as such rather than as a percentage.
print(f'R^2: {dtree.score(test_X, test_y):.4f}')

# Regression predictions are evaluated with a regression metric;
# classification_report is only meaningful for discrete class predictions.
pred_y = dtree.predict(test_X)
print(f'Mean Squared Error: {mean_squared_error(test_y, pred_y):.4f}')

## Random Forest

In [None]:
rfc = RandomForestRegressor(random_state=45)
rfc.fit(train_X, train_y)

# For a regressor, .score() is the coefficient of determination (R^2), not an
# accuracy, so it is reported as such rather than as a percentage.
print(f'R^2: {rfc.score(test_X, test_y):.4f}')

# The forest averages its trees, so predictions are continuous values;
# classification_report cannot score them, hence a regression metric is used.
pred_y = rfc.predict(test_X)
print(f'Mean Squared Error: {mean_squared_error(test_y, pred_y):.4f}')

## SVR (Support Vector Regression)

In [None]:
svr = SVR()
svr.fit(train_X, train_y)

# For a regressor, .score() is the coefficient of determination (R^2), not an
# accuracy, so it is reported as such rather than as a percentage.
print(f'R^2: {svr.score(test_X, test_y):.4f}')

# SVR outputs continuous values; classification_report cannot score them,
# hence a regression metric is used.
pred_y = svr.predict(test_X)
print(f'Mean Squared Error: {mean_squared_error(test_y, pred_y):.4f}')

IMPORTANTE: Podemos treinar e testar os modelos um de cada vez, como está em cima, ou então todos de uma vez, como o professor fez na aula 10. Já incluí os modelos que me pareceram adequados; vejam se faz sentido.

In [None]:
# Train and evaluate every candidate model in one pass.
models = [LogisticRegression(random_state=42, max_iter=1000),
          DecisionTreeRegressor(random_state=42),
          RandomForestRegressor(random_state=42),
          SVR()]

for model in models:
    model.fit(train_X, train_y)
    test_pred = model.predict(test_X)  # predict once and reuse for every metric
    print(model.__class__.__name__)
    if hasattr(model, 'predict_proba'):
        # Probabilistic classifier: classification metrics apply.
        # NOTE(review): requires 'Y' to hold discrete (binary) labels -- confirm.
        print('Accuracy on test set:', model.score(test_X, test_y))
        print('Classification report:\n', classification_report(test_y, test_pred))
        print('Confusion matrix:\n', confusion_matrix(test_y, test_pred))
        print('ROC AUC score:', roc_auc_score(test_y, model.predict_proba(test_X)[:, 1]))
    else:
        # Regressors expose no predict_proba and produce continuous outputs,
        # so they are scored with R^2 (.score) and mean squared error instead.
        print('R^2 on test set:', model.score(test_X, test_y))
        print('Mean squared error:', mean_squared_error(test_y, test_pred))
    print('-------------------')

## Model Validation

In [None]:
# Model validation: 5-fold cross-validation of the logistic-regression model
# on the training split, reporting each fold's score and their mean.
scores = cross_val_score(estimator=logreg, X=train_X, y=train_y, cv=5)

print('Cross validation scores:', scores)
print('Mean cross validation score:', scores.mean())

In [None]:
# Bootstrap: refit the model on resampled-with-replacement training sets and
# score each fit on the fixed test split to estimate the score's variability.
N_BOOTSTRAP = 1000  # number of bootstrap replicates; lower it for a quicker run

scores = []
for i in range(N_BOOTSTRAP):
    # Seed each draw with the iteration index so the experiment is reproducible.
    X_boot, y_boot = resample(train_X, train_y, random_state=i)
    # Fit a fresh estimator with the same hyperparameters so the shared
    # `logreg` is not left fitted on an arbitrary bootstrap sample.
    boot_model = LogisticRegression(**logreg.get_params())
    boot_model.fit(X_boot, y_boot)
    scores.append(boot_model.score(test_X, test_y))

print('Mean bootstrap score:', np.mean(scores))
print('Standard deviation of bootstrap scores:', np.std(scores))

## Hyperparameter tuning

We will use random search to find the best hyperparameters for our models.

In [None]:
# Hyperparameter tuning

# random forest hyperparameter tuning
# 'max_features': 1.0 (use all features) replaces the former 'auto' option,
# which newer scikit-learn releases no longer accept for regressors.
param_grid = {'n_estimators': [10, 100, 1000],
              'max_depth': [None, 5, 10, 20],
              'max_features': [1.0, 'sqrt'],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

rf = RandomForestRegressor(random_state=42)
# random_state fixes which 5 parameter combinations are sampled, so a
# Restart & Run All reproduces the same search.
rand_search = RandomizedSearchCV(rf, param_grid, cv=5, verbose=2, n_jobs=-1, n_iter=5, random_state=42)
rand_search.fit(train_X, train_y)

# Only the last expression of a cell is displayed, so report results explicitly.
best_rf = rand_search.best_estimator_
print('Best parameters:', rand_search.best_params_)
print('Best cross-validation score:', rand_search.best_score_)
print('Test score of best estimator:', best_rf.score(test_X, test_y))

# MSE compares the true test targets with the tuned model's own predictions.
mse = mean_squared_error(test_y, best_rf.predict(test_X))
print(f'Mean Squared Error: {mse}')


#grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
#grid_search.best_estimator_ # best model

## Save and load the best model

In [None]:
# Persist the tuned estimator to disk, then read it back and score the
# reloaded copy on the test split as a round-trip check.
# joblib files are pickle-based: only load files from a trusted source.
MODEL_PATH = 'best_model.pkl'

joblib.dump(rand_search.best_estimator_, MODEL_PATH)

best_model = joblib.load(MODEL_PATH)
best_model.score(test_X, test_y)

## Model interpretation

scikit-learn provides multiple methods for model interpretation. Here we will see feature importance and permutation importance.


In [None]:
# Model interpretation
# feature importance

rf = RandomForestRegressor(random_state=42, n_estimators=1000, max_depth=10, max_features='sqrt', min_samples_split=5, min_samples_leaf=2)
rf.fit(train_X, train_y)

# plot feature importance
# The model is trained on columns of X, so X.columns supplies the feature names
# in the same order as rf.feature_importances_ (no external column list needed).
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances.nlargest(10).plot(kind='barh')
plt.xlabel("Feature Importance")
plt.show()

In [None]:
# feature importance based on permutation importance

# Feature names come from X, whose columns are the ones the model was trained on.
feature_names = X.columns.to_numpy()

# random_state makes the random permutations (and thus the plot) reproducible.
perm_importance = permutation_importance(rf, test_X, test_y, random_state=42)

# argsort is ascending, so the most important features are at the END of
# sorted_idx; take the last 10 to plot the top 10 (largest bar drawn on top).
sorted_idx = perm_importance.importances_mean.argsort()
top_idx = sorted_idx[-10:]
plt.barh(feature_names[top_idx], perm_importance.importances_mean[top_idx])
plt.xlabel("Permutation Importance")
plt.show()

In [None]:

# Plot the 10 lowest and 10 highest permutation importances side by side.
# Relies on `perm_importance` and `sorted_idx` (ascending order) from the
# permutation-importance cell above.
feature_names = X.columns.to_numpy()  # columns the model was trained on

N_EXTREMES = 10
if len(sorted_idx) > 2 * N_EXTREMES:
    pos_neg_idx = np.concatenate((sorted_idx[:N_EXTREMES], sorted_idx[-N_EXTREMES:]))
else:
    # With few features the two slices would overlap and duplicate bars,
    # so simply show every feature once.
    pos_neg_idx = sorted_idx

plt.barh(feature_names[pos_neg_idx], perm_importance.importances_mean[pos_neg_idx])
plt.xlabel("Permutation Importance")
plt.show()