## Importing Essential Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn import preprocessing

from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     cross_val_score, cross_validate)

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef, confusion_matrix, 
                             classification_report)

### Load Dataset

In [None]:
# Read the semicolon-delimited table. The file carries no header row, so the
# columns end up with integer labels.
data = pd.read_csv(
    "qsar_androgen_receptor.csv",
    sep=";",
    header=None,
)
# Preview the first ten rows.
data.head(n=10)

#### The dataset has 1687 observations where the features are molecular fingerprints (1 to 1024) and the last column is the target
#### The next step is to split the dataset into features (x) and target (y)

In [None]:
# Feature matrix: the first 1024 columns (positions 0-1023) of the raw table.
x = data.iloc[:, :1024]
x.head(n=10)

In [None]:
# Target vector: the last column of the raw table, converted to integer class
# codes by a label encoder (kept around so the codes can be mapped back).
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(data.iloc[:, -1])
y

### Checking the Data, Features and Target Dimensions

In [None]:
# Report the shape of the raw table, the feature matrix and the target vector.
for _caption, _shape in (
    ("Data Dimensions: ", data.shape),
    ("Features Dimensions: ", x.shape),
    ("Target Dimensions: ", y.shape),
):
    print(_caption, _shape)

### The Problem is to Deal with Unbalanced Data

#### Check the Difference Between the number of active molecules (positive) and inactive molecules (negative)

In [None]:
# Count how many molecules fall into each class of the target column (1024).
frequency = data[1024].value_counts().reset_index()

# Name the two resulting columns positionally: class label, then its count.
frequency.columns = ['Activity', 'Frequency']

# Work on an independent copy so that relabelling the classes below does not
# also modify `frequency`.
groups = frequency.copy()

# Human-readable class names. The raw column is presumably stored as the
# strings 'positive'/'negative' (the target is label-encoded earlier), in which
# case a mapping keyed only on 0/1 would silently change nothing -- TODO confirm
# against the CSV. Both encodings are accepted; unknown values pass through.
mapping = {0: 'Inactive', 1: 'Active',
           'negative': 'Inactive', 'positive': 'Active'}
groups['Activity'] = groups['Activity'].map(lambda label: mapping.get(label, label))
print(groups)

##### First, the models are trained on the data as-is, without correcting the class imbalance
#### Two models were tested for this purpose, namely Support Vector Classifier (SVC) and Random Forest (RF)

### Splitting the Data into Training and Test Set

In [None]:
# Hold out 20 % of the samples as a test set; the seed is fixed so the split
# is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

#### Since the features are molecular fingerprints, no preprocessing is applied; we proceed directly to model building

#### Firstly, the best hyperparameters were tuned, and then some common performance measurements were computed
#### A detailed report is given for the training and test sets here

## 1) Support Vector Classifier

In [None]:
# Hyperparameter grid explored for the support vector classifier.
grid_svc = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    degree=[2, 3, 4],
    kernel=["linear", "poly", "rbf", "sigmoid"],
)

# Exhaustive 3-fold cross-validated search, fitted on the training split.
svc_cv = GridSearchCV(
    estimator=SVC(random_state=0),
    param_grid=grid_svc,
    cv=3,
    verbose=True,
)
svc_cv.fit(x_train, y_train)

# Predictions of the tuned search object on both splits.
ytest_pred_svc = svc_cv.predict(x_test)
ytrain_pred_svc = svc_cv.predict(x_train)

print(f"Best hiperparameters: {svc_cv.best_params_}")

In [None]:
# Test-set scores for the SVC; precision, recall and F1 are macro-averaged.
accuracy_score_svc = accuracy_score(y_test, ytest_pred_svc)
matthews_corrcoef_score_svc = matthews_corrcoef(y_test, ytest_pred_svc)
precision_score_svc = precision_score(y_test, ytest_pred_svc, average="macro")
recall_score_svc = recall_score(y_test, ytest_pred_svc, average="macro")
f1_score_svc = f1_score(y_test, ytest_pred_svc, average="macro")

# Confusion matrix of the test predictions, drawn as an annotated heat map
# whose title carries the macro F1.
cm_svc = confusion_matrix(y_test, ytest_pred_svc)

fig, ax = plt.subplots()
sns.heatmap(cm_svc, annot=True, fmt=".0f", linewidths=1, square=True,
            cmap="Reds", ax=ax)
ax.set_ylabel("Actual Label", color="black")
ax.set_xlabel("Predicted Label", color="black")
ax.set_title(f"F1: {f1_score_svc:.2f}", size=14, color="black")
plt.show()

In [None]:
# Per-class report for the SVC: training split first, then the test split.
for _heading, _truth, _predicted in (
    ("performance metrics for training", y_train, ytrain_pred_svc),
    ("performance metrics for test", y_test, ytest_pred_svc),
):
    print(_heading)
    print(classification_report(_truth, _predicted))

## 2) Random Forest Classifier

In [None]:
# Random-forest search space: 119 forest sizes (1..119) x 2 max_features
# x 3 max_depth x 2 bootstrap settings = 1428 candidates.
grid_RF = {"n_estimators": np.arange(1, 120), 'max_features': [1, 20],
           "max_depth": [1, 50, None], 'bootstrap': [True, False]}

# 3-fold cross-validated search (1428 candidates x 3 folds = 4284 fits).
# Candidates are ranked by accuracy, a classification metric, instead of the
# regression metric 'neg_mean_squared_error' used before. With the two
# label-encoded classes (0/1) the squared error equals the misclassification
# rate, so the ranking is the same, and the criterion now matches the SVC
# search, which uses the classifier's default accuracy score.
RF_cv = GridSearchCV(RandomForestClassifier(random_state=0), grid_RF, cv=3, verbose=1, scoring='accuracy')
RF_cv.fit(x_train, y_train)

# Predictions of the tuned search object on both splits.
ytrain_pred_RF = RF_cv.predict(x_train)
ytest_pred_RF = RF_cv.predict(x_test)

print("Best hiperparameters: {}".format(RF_cv.best_params_))

In [None]:
# Test-set scores for the random forest; precision, recall and F1 are
# macro-averaged.
f1_score_RF = f1_score(y_test, ytest_pred_RF, average="macro")
precision_score_RF = precision_score(y_test, ytest_pred_RF, average="macro")
accuracy_score_RF = accuracy_score(y_test, ytest_pred_RF)
recall_score_RF = recall_score(y_test, ytest_pred_RF, average="macro")
matthews_corrcoef_score_RF = matthews_corrcoef(y_test, ytest_pred_RF)

# Fixed: the matrix was previously built from the SVC predictions
# (ytest_pred_svc), so the plot showed the wrong model under an RF title.
cm_RF = confusion_matrix(y_test, ytest_pred_RF)
sns.heatmap(cm_RF, annot=True, fmt=".0f", linewidths=1, square=True, cmap="Reds")

plt.ylabel("Actual Label", color="black")
plt.xlabel("Predicted Label", color="black")
plt.title(f"F1: {f1_score_RF:.2f}", size=14, color="black")
plt.show()

In [None]:
# Per-class report for the random forest: training split first, then test.
for _heading, _truth, _predicted in (
    ("performance metrics for training", y_train, ytrain_pred_RF),
    ("performance metrics for test", y_test, ytest_pred_RF),
):
    print(_heading)
    print(classification_report(_truth, _predicted))

## Dealing with Unbalanced Classes

#### Here, the class imbalance is corrected by oversampling, and SVC and RF are trained again to assess the effect of balancing the data

In [None]:
# Seeded SMOTE oversampler, fitted on the complete feature matrix and label
# vector (not on a training subset).
sm = SMOTE(random_state=0)
x_balanced, y_balanced = sm.fit_resample(X=x, y=y)

In [None]:
# Dimensions of the resampled feature set returned by SMOTE.
x_balanced.shape

In [None]:
# Dimensions of the resampled target returned by SMOTE.
y_balanced.shape

In [None]:
# Split the ORIGINAL data first, then oversample the training part only.
# Splitting an already SMOTE-resampled dataset leaks information: synthetic
# minority samples are interpolated from real rows, so samples derived from
# test rows would land in the training set and inflate the test scores.
# The split arguments match the unbalanced experiment above, so both
# experiments are evaluated on the same hold-out rows.
x_train_unbalanced, x_test_balanced, y_train_unbalanced, y_test_balanced = train_test_split(
    x, y, test_size=0.20, random_state=0
)

# Balance the classes on the training portion only. The hold-out set keeps its
# real class distribution; the `*_test_balanced` names are retained so the
# following cells keep working unchanged.
x_train_balanced, y_train_balanced = SMOTE(random_state=0).fit_resample(
    x_train_unbalanced, y_train_unbalanced
)

## 1B) Support Vector Classifier

In [None]:
# Hyperparameter grid explored for the SVC trained on the balanced data.
grid_svc_B = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    degree=[2, 3, 4],
    kernel=["linear", "poly", "rbf", "sigmoid"],
)

# Exhaustive 3-fold cross-validated search, fitted on the balanced training split.
svc_B_cv = GridSearchCV(
    estimator=SVC(random_state=0),
    param_grid=grid_svc_B,
    cv=3,
    verbose=True,
)
svc_B_cv.fit(x_train_balanced, y_train_balanced)

# Predictions of the tuned search object on both splits.
ytest_pred_svc_B = svc_B_cv.predict(x_test_balanced)
ytrain_pred_svc_B = svc_B_cv.predict(x_train_balanced)

print(f"Best hiperparameters: {svc_B_cv.best_params_}")

In [None]:
# Test-set scores for the SVC trained on balanced data; precision, recall and
# F1 are macro-averaged.
accuracy_score_svc_B = accuracy_score(y_test_balanced, ytest_pred_svc_B)
matthews_corrcoef_score_svc_B = matthews_corrcoef(y_test_balanced, ytest_pred_svc_B)
precision_score_svc_B = precision_score(y_test_balanced, ytest_pred_svc_B, average="macro")
recall_score_svc_B = recall_score(y_test_balanced, ytest_pred_svc_B, average="macro")
f1_score_svc_B = f1_score(y_test_balanced, ytest_pred_svc_B, average="macro")

# Confusion matrix of the test predictions, drawn as an annotated heat map
# whose title carries the macro F1.
cm_svc_B = confusion_matrix(y_test_balanced, ytest_pred_svc_B)

fig, ax = plt.subplots()
sns.heatmap(cm_svc_B, annot=True, fmt=".0f", linewidths=1, square=True,
            cmap="Reds", ax=ax)
ax.set_ylabel("Actual Label", color="black")
ax.set_xlabel("Predicted Label", color="black")
ax.set_title(f"F1: {f1_score_svc_B:.2f}", size=14, color="black")
plt.show()

In [None]:
# Per-class report for the balanced-data SVC: training split first, then test.
for _heading, _truth, _predicted in (
    ("performance metrics for training", y_train_balanced, ytrain_pred_svc_B),
    ("performance metrics for test", y_test_balanced, ytest_pred_svc_B),
):
    print(_heading)
    print(classification_report(_truth, _predicted))

## 2B) Random Forest Classifier

In [None]:
# Random-forest search space for the balanced data: 119 forest sizes (1..119)
# x 2 max_features x 3 max_depth x 2 bootstrap settings = 1428 candidates.
grid_RF_B = {"n_estimators": np.arange(1, 120), 'max_features': [1, 20],
           "max_depth": [1, 50, None], 'bootstrap': [True, False]}

# 3-fold cross-validated search ranked by accuracy. A classification metric
# replaces the regression metric 'neg_mean_squared_error'; for 0/1 labels the
# two rank candidates identically.
RF_cv_B = GridSearchCV(RandomForestClassifier(random_state=0), grid_RF_B, cv=3, verbose=1, scoring='accuracy')
RF_cv_B.fit(x_train_balanced, y_train_balanced)

# Fixed: predictions and the reported hyperparameters previously came from
# RF_cv (the model tuned on the unbalanced data), so the search fitted just
# above was never evaluated.
ytrain_pred_RF_B = RF_cv_B.predict(x_train_balanced)
ytest_pred_RF_B = RF_cv_B.predict(x_test_balanced)

print("Best hiperparameters: {}".format(RF_cv_B.best_params_))

In [None]:
# Test-set scores for the random forest of the balanced experiment; precision,
# recall and F1 are macro-averaged.
accuracy_score_RF_B = accuracy_score(y_test_balanced, ytest_pred_RF_B)
matthews_corrcoef_score_RF_B = matthews_corrcoef(y_test_balanced, ytest_pred_RF_B)
precision_score_RF_B = precision_score(y_test_balanced, ytest_pred_RF_B, average="macro")
recall_score_RF_B = recall_score(y_test_balanced, ytest_pred_RF_B, average="macro")
f1_score_RF_B = f1_score(y_test_balanced, ytest_pred_RF_B, average="macro")

# Confusion matrix of the test predictions, drawn as an annotated heat map
# whose title carries the macro F1.
cm_RF_B = confusion_matrix(y_test_balanced, ytest_pred_RF_B)

fig, ax = plt.subplots()
sns.heatmap(cm_RF_B, annot=True, fmt=".0f", linewidths=1, square=True,
            cmap="Reds", ax=ax)
ax.set_ylabel("Actual Label", color="black")
ax.set_xlabel("Predicted Label", color="black")
ax.set_title(f"F1: {f1_score_RF_B:.2f}", size=14, color="black")
plt.show()

In [None]:
# Per-class report for the balanced-experiment random forest: training split
# first, then the test split.
for _heading, _truth, _predicted in (
    ("performance metrics for training", y_train_balanced, ytrain_pred_RF_B),
    ("performance metrics for test", y_test_balanced, ytest_pred_RF_B),
):
    print(_heading)
    print(classification_report(_truth, _predicted))

#### Both balanced models were compared in the following cell
#### It is possible to see that SVC has a superior performance in this case

In [None]:
# Side-by-side comparison of the two models from the balanced experiment.
# Each single-column frame holds one metric, in the order [SVC, Random Forest].
models = pd.DataFrame(["Support Vector Classifier", "Random Forest Classifier"])

f1 = pd.DataFrame([f1_score_svc_B, f1_score_RF_B])

# Fixed: the SVC entry previously used precision_score_svc, the score from the
# unbalanced experiment, mixing two experiments in one column.
precision = pd.DataFrame([precision_score_svc_B, precision_score_RF_B])

accuracy = pd.DataFrame([accuracy_score_svc_B, accuracy_score_RF_B])

recall = pd.DataFrame([recall_score_svc_B, recall_score_RF_B])

matthews = pd.DataFrame([matthews_corrcoef_score_svc_B, matthews_corrcoef_score_RF_B])

# Join the columns and label them.
df2 = pd.concat([models, f1, precision, accuracy, recall, matthews], axis=1)
df2.columns = ["Algorithm", "F1", "Precision", "Accuracy", "Recall", "Matthews"]

# Highlight the best value of every metric column.
df2.style.highlight_max(subset=["F1", "Precision", "Accuracy", "Recall", "Matthews"], color="#01AEB5")