In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases removed the old names, so use whichever name this install provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn import svm
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
seed = 42

In [None]:
# Load the spreadsheet (feature columns plus the pCC50 target) from the mounted Drive.
excel_path = r"/content/drive/MyDrive/MPHARM/00_MPH/Final/RFR_top 50.xlsx"
df = pd.read_excel(excel_path)
df

In [None]:
print(df.columns)

In [None]:
df.info()

In [None]:
df.isnull()

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum().sum()

In [None]:
# Replace every missing cell with 0 so later steps receive a fully populated table.
# NOTE(review): this also zero-fills pCC50 if the target has gaps — confirm that is intended.
df2 = df.fillna(0)
df2

In [None]:
df2.isnull().sum().sum()

In [None]:
df2.info()

In [None]:
df2.describe()

In [None]:
df2.head()

In [None]:
df2.describe().T

In [None]:
# Feature matrix: every column of the imputed table except the pCC50 target.
X = df2.drop(columns=['pCC50'])
X

In [None]:
# Regression target: the pCC50 column of the imputed table.
Y = df2['pCC50']
Y

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
corr = df2.corr()

In [None]:
corr

In [None]:
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=seed,
)

In [None]:
print(X_train.shape,X_test.shape, Y_train.shape, Y_test.shape)

In [None]:
X_train

In [None]:
Y_train

In [None]:
X_test

In [None]:
Y_test

In [None]:
# One 50-bin histogram per column of the imputed table; the grid is written to
# disk before it is displayed.
hist_axes = df2.hist(figsize=(20, 15), bins=50)
hist_fig = plt.gcf()
hist_fig.savefig('figrfr1.png')
plt.show()

In [None]:
model = RandomForestRegressor(random_state=seed)

In [None]:
model.fit(X_train, Y_train)

In [None]:
Y_train_pred = model.predict(X_train)

In [None]:
print(Y_train_pred)

In [None]:
predictions1 = pd.DataFrame({'Y_train' : Y_train, 'Y_train_pred' : Y_train_pred})

In [None]:
predictions1

In [None]:
len(Y_train_pred)

In [None]:
print('The training r_sq is: %.4f'% model.score(X_train, Y_train))

In [None]:
print('The MAE is: %.4f'% mean_absolute_error(Y_train, Y_train_pred))

In [None]:
print('The RMSE is: %.4f'% np.sqrt(mean_squared_error(Y_train, Y_train_pred)))

In [None]:
print('The EVS is: %.4f'% explained_variance_score(Y_train, Y_train_pred))

In [None]:
# Observed vs. predicted pCC50 for the training split. The x-axis is simply the
# positional index of each training row.
plt.rcParams['figure.figsize'] = (10, 6)
x_ax = range(len(X_train))
fig, ax = plt.subplots()
ax.plot(x_ax, Y_train, color='k', linestyle='-', label='Observed')
ax.plot(x_ax, Y_train_pred, color='k', linestyle='--', label='Predicted')
ax.set_ylabel('pCC50')
ax.set_xlabel('Experimental Run')
# Legend sits below the axes, in two columns, without a frame.
ax.legend(bbox_to_anchor=(0.5, -0.2), loc='lower center', ncol=2, frameon=False)

In [None]:
Y_test_pred = model.predict(X_test)

In [None]:
Y_test_pred

In [None]:
len(Y_test_pred)

In [None]:
predictions2 = pd.DataFrame({'Y_test' : Y_test, 'Y_test_pred' : Y_test_pred})

In [None]:
predictions2

In [None]:
print('The testing r_sq is: %.4f'% r2_score(Y_test, Y_test_pred))

In [None]:
print('The testing r_sq is: %.4f'% model.score(X_test, Y_test))

In [None]:
print('The MAE is: %.4f'% mean_absolute_error(Y_test, Y_test_pred))

In [None]:
print('The RMSE is: %.4f'% np.sqrt(mean_squared_error(Y_test, Y_test_pred)))

In [None]:
print('The EVS is: %.4f'% explained_variance_score(Y_test, Y_test_pred))

In [None]:
# Observed vs. predicted pCC50 for the held-out split. The x-axis is simply the
# positional index of each test row.
plt.rcParams['figure.figsize'] = (10, 6)
x_ax = range(len(X_test))
fig, ax = plt.subplots()
ax.plot(x_ax, Y_test, color='k', linestyle='-', label='Observed')
ax.plot(x_ax, Y_test_pred, color='k', linestyle='--', label='Predicted')
ax.set_ylabel('pCC50')
ax.set_xlabel('Experimental Run')
# Legend sits below the axes, in two columns, without a frame.
ax.legend(bbox_to_anchor=(0.5, -0.2), loc='lower center', ncol=2, frameon=False)

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error plot for the baseline forest: fitted with the training split,
# scored (and drawn) with the held-out split.
visualizer = PredictionError(model)
visualizer.fit(X_train, Y_train)
visualizer.score(X_test, Y_test)
# poof() is deprecated in Yellowbrick; show() is its supported replacement.
visualizer.show()

In [None]:
from yellowbrick.regressor import ResidualsPlot

# Residuals plot for the baseline forest: training residuals come from fit(),
# held-out residuals from score().
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, Y_train)
visualizer.score(X_test, Y_test)
# poof() is deprecated in Yellowbrick; show() is its supported replacement.
visualizer.show()

In [None]:
# Predicted vs. actual pCC50 on the training split; the line plots the actual
# values against themselves, giving the y = x reference.
ax = sns.scatterplot(x=Y_train, y=Y_train_pred, alpha=0.6)
sns.lineplot(x=Y_train, y=Y_train, ax=ax)
ax.set_xlabel('Actual', fontsize=14)
ax.set_ylabel('Predicted', fontsize=14)
ax.set_title('Actual vs Predicted', fontsize=17)
plt.show()

In [None]:
# Predicted vs. actual pCC50 on the held-out split; the line plots the actual
# values against themselves, giving the y = x reference.
ax = sns.scatterplot(x=Y_test, y=Y_test_pred, alpha=0.6)
sns.lineplot(x=Y_test, y=Y_test, ax=ax)
ax.set_xlabel('Actual', fontsize=14)
ax.set_ylabel('Predicted', fontsize=14)
ax.set_title('Actual vs Predicted', fontsize=17)
plt.show()

In [None]:
model.score(X_test, Y_test)

In [None]:
# Test-set error of the baseline model.
# The RMSE is derived from the unrounded MSE: taking the square root of an MSE
# that was already rounded to two decimals compounds the rounding error (an MSE
# below 0.005 would even be reported as an RMSE of 0).
mse_exact = np.mean(np.square(Y_test - Y_test_pred))
MSE_test = round(mse_exact, 2)
RMSE_test = round(np.sqrt(mse_exact), 2)
RMSE_test

In [None]:
# Hyperparameter grid for the forest: 3 x 2 x 3 = 18 combinations.
my_param_grid = {
    'n_estimators': [10, 100, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 10, 20],
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
GridSearchCV(estimator=RandomForestRegressor(random_state=seed), param_grid = my_param_grid, refit = True, verbose = 3, cv=5)

In [None]:
# 5-fold grid search over my_param_grid using a seeded random forest.
# refit=True makes the fitted search object usable directly for predict()/score().
grid = GridSearchCV(
    RandomForestRegressor(random_state=seed),
    my_param_grid,
    cv=5,
    refit=True,
    verbose=3,
)

In [None]:
grid.fit(X_train, Y_train)

In [None]:
grid.best_params_

In [None]:
Y_test_optimised = grid.predict(X_test)

In [None]:
Y_test_optimised

In [None]:
predictions2['Y_test_optimised'] = Y_test_optimised
predictions2.head()

In [None]:
# Predicted vs. actual pCC50 for the tuned model on the held-out split; the line
# plots the actual values against themselves, giving the y = x reference.
# The target is pCC50, not a count, so the axes are labelled 'Actual'/'Predicted'.
sns.scatterplot(x=Y_test, y=Y_test_optimised, alpha=0.6)
sns.lineplot(x=Y_test, y=Y_test)
plt.xlabel('Actual', fontsize = 14)
plt.ylabel('Predicted', fontsize = 14)
plt.title('Actual vs Optimised Predicted', fontsize = 17)
plt.show()

In [None]:
grid.score(X_test, Y_test)

In [None]:
# Test-set error of the tuned model.
# The RMSE is derived from the unrounded MSE: taking the square root of an MSE
# that was already rounded to two decimals compounds the rounding error (an MSE
# below 0.005 would even be reported as an RMSE of 0).
mse_exact = np.mean(np.square(Y_test - Y_test_optimised))
MSE_test = round(mse_exact, 2)
RMSE_test = round(np.sqrt(mse_exact), 2)
RMSE_test

In [None]:
# R^2 of the tuned model's test predictions against the true test targets.
# model.score(X_test, Y_test_optimised) would instead measure how closely the
# baseline model reproduces the tuned model's predictions.
print('The testing r_sq is: %.4f'% r2_score(Y_test, Y_test_optimised))

In [None]:
print('The MAE is: %.4f'% mean_absolute_error(Y_test, Y_test_optimised))

In [None]:
print('The RMSE is: %.4f'% np.sqrt(mean_squared_error(Y_test, Y_test_optimised)))

In [None]:
print('The EVS is: %.4f'% explained_variance_score(Y_test, Y_test_optimised))

In [None]:
# Observed pCC50 vs. the tuned model's predictions for the held-out split.
# The x-axis is simply the positional index of each test row.
plt.rcParams['figure.figsize'] = (10, 6)
x_ax = range(len(X_test))
fig, ax = plt.subplots()
ax.plot(x_ax, Y_test, color='k', linestyle='-', label='Observed')
ax.plot(x_ax, Y_test_optimised, color='k', linestyle='--', label='Optimised Predicted')
ax.set_ylabel('pCC50')
ax.set_xlabel('Experimental Run')
# Legend sits below the axes, in two columns, without a frame.
ax.legend(bbox_to_anchor=(0.5, -0.2), loc='lower center', ncol=2, frameon=False)

In [None]:
Y_train_optimised = grid.predict(X_train)

In [None]:
Y_train_optimised

In [None]:
predictions1['Y_train_optimised'] = Y_train_optimised
predictions1.head()

In [None]:
# Tuned-model predictions vs. actual pCC50 on the training split; the line plots
# the actual values against themselves, giving the y = x reference.
ax = sns.scatterplot(x=Y_train, y=Y_train_optimised, alpha=0.6)
sns.lineplot(x=Y_train, y=Y_train, ax=ax)
ax.set_xlabel('Actual', fontsize=14)
ax.set_ylabel('Predicted', fontsize=14)
ax.set_title('Actual vs Optimised Predicted', fontsize=17)
plt.show()

In [None]:
# R^2 of the tuned model's training predictions against the true training targets.
# model.score(X_train, Y_train_optimised) would instead measure how closely the
# baseline model reproduces the tuned model's predictions.
print('The training r_sq is: %.4f'% r2_score(Y_train, Y_train_optimised))

In [None]:
print('The MAE is: %.4f'% mean_absolute_error(Y_train, Y_train_optimised))

In [None]:
print('The RMSE is: %.4f'% np.sqrt(mean_squared_error(Y_train, Y_train_optimised)))

In [None]:
print('The EVS is: %.4f'% explained_variance_score(Y_train, Y_train_optimised))

In [None]:
# Observed pCC50 vs. the tuned model's predictions for the training split.
# The x-axis is simply the positional index of each training row.
plt.rcParams['figure.figsize'] = (10, 6)
x_ax = range(len(X_train))
fig, ax = plt.subplots()
ax.plot(x_ax, Y_train, color='k', linestyle='-', label='Observed')
ax.plot(x_ax, Y_train_optimised, color='k', linestyle='--', label='Optimised Predicted')
ax.set_ylabel('pCC50')
ax.set_xlabel('Experimental Run')
# Legend sits below the axes, in two columns, without a frame.
ax.legend(bbox_to_anchor=(0.5, -0.2), loc='lower center', ncol=2, frameon=False)

In [None]:
from yellowbrick.regressor import ResidualsPlot

# Residuals of the tuned forest selected by the grid search.
# The visualizer must wrap the tuned estimator and be given the TRUE targets:
# passing the baseline `model` together with the tuned model's predictions as
# "targets" would plot the disagreement between two models, not residuals.
visualizer = ResidualsPlot(grid.best_estimator_)
visualizer.fit(X_train, Y_train)
visualizer.score(X_test, Y_test)
# poof() is deprecated in Yellowbrick; show() is its supported replacement.
visualizer.show()

In [None]:
features = list(X_train.columns)

In [None]:
features

In [None]:
# Seeded forest with explicitly pinned hyperparameters.
# NOTE(review): these values look copied from grid.best_params_ — confirm they
# still match the grid-search result.
RFR = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    random_state=seed,
)

In [None]:
RFR.fit(X_train, Y_train)

In [None]:
importance = RFR.feature_importances_
importance

In [None]:
FIM = pd.DataFrame({'features' : features, 'feature_importances' : importance})
FIM

In [None]:
# Horizontal bar chart of the forest's feature importances.
# Everything is drawn on ONE figure: opening one figure for the dpi and a second
# one for the size leaves the first empty and saves the second at default resolution.
fig, ax = plt.subplots(figsize=(10, 20))
sns.barplot(y='features', x='feature_importances', data=FIM, ax=ax)
ax.set_title('feature importances')
# The high resolution is applied to the file on disk only, so the inline preview
# stays light; bbox_inches='tight' keeps long feature names from being clipped.
fig.savefig('figrfr2.png', dpi=800, bbox_inches='tight')

plt.show()

In [None]:
# %pip installs into the environment of the running kernel, which a shell
# `!pip` call does not guarantee.
%pip install shap

In [None]:
import shap

# Explain the tuned forest (RFR) — the same estimator whose feature_importances_
# are reported above — so the impurity-based and SHAP rankings describe one
# model. RFR is fitted once on the true training targets and is not handed to
# any later fit() call, unlike the baseline `model`.
X = df2.drop(['pCC50'], axis=1)
explainer = shap.TreeExplainer(RFR, feature_perturbation="tree_path_dependent")
# SHAP values are computed for every row of X (training and test rows together).
shap_values = explainer.shap_values(X)

In [None]:
# SHAP summary plot on a 300-dpi figure. show=False leaves the figure open so
# the axis label and tick size can be adjusted before it is saved and displayed.
plt.figure(dpi=300)
shap.summary_plot(shap_values, X, color_bar=False, show=False)
summary_ax = plt.gca()
summary_ax.set_xlabel("SHAP value of pCC50 model", fontweight='bold', fontsize=6)
summary_ax.tick_params(labelsize=6)
plt.savefig('figrfr3.png')

plt.show()