In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the training and test tables from CSV files located in the working directory.
dataset_train, dataset_test = (
    pd.read_csv(csv_name)
    for csv_name in ('xtrainCyclodextrin.csv', 'xtestCyclodextrin.csv')
)

In [3]:
# Working handles for the two tables. These are plain references, not copies:
# any in-place change made through x_train / x_test is also visible through
# dataset_train / dataset_test.
x_train, x_test = dataset_train, dataset_test

In [4]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


# 1. Choose the columns DBSCAN clusters on.
# Only 'K' is used; the alternative ['K', 'Erreur'] gave a noticeably worse model.
# NOTE(review): 'K' is also the regression target used later in this notebook, so
# rows are being filtered on the target itself — this may explain the improvement; verify.
selected_columns = ['K']
X = x_train.loc[:, selected_columns]

# 2. Standardise the selected columns.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Cluster with DBSCAN (eps / min_samples come from the tuning done in Untitled11).
dbscan = DBSCAN(eps=0.005, min_samples=9)
labels = dbscan.fit(X_scaled).labels_

# 4. Store the labels on the frame; DBSCAN marks noise points with label -1.
x_train['Cluster'] = labels
outliers = x_train.loc[x_train['Cluster'].eq(-1)]

# Report the frame shape (including the new 'Cluster' column) and the flagged K values.
print("Original Data:", x_train.shape, sep="\n")
print("\nOutliers:", outliers.loc[:, ['K']], sep="\n")

Original Data:
(2940, 50)

Outliers:
           K
9      37300
27      9420
38      6000
54    107358
58      9560
...      ...
2863    8100
2873    5777
2885    6627
2896    4434
2925   12700

[246 rows x 1 columns]


In [5]:
# Keep only the rows whose DBSCAN label is not -1, then discard the helper column.
x_train = x_train.loc[x_train['Cluster'] != -1].drop(columns=['Cluster'])

In [6]:
# Estimate an error value for rows that have a logK: |exp(logK) - K| / 10.
# NOTE(review): np.exp assumes logK is a natural logarithm — confirm against the data source.
# NOTE(review): this estimate is computed from K, which is also the target — check for leakage.
logK_known = x_train["logK"].notna()
Erreur = np.abs(np.exp(x_train.loc[logK_known, "logK"]) - x_train.loc[logK_known, "K"]) / 10

# Fill missing training 'Erreur' values with the estimate (aligned on the row index;
# rows without a logK have no estimate and stay missing).
x_train.loc[x_train["Erreur"].isna(), "Erreur"] = Erreur

# Fill missing test 'Erreur' values with the mean of the training column.
# TODO: check whether this approach actually works.
x_test.loc[x_test["Erreur"].isna(), "Erreur"] = x_train["Erreur"].mean()

In [7]:
# Regression target: the 'K' column of the filtered training frame.
y_train = x_train.loc[:, 'K']

In [8]:
# Columns excluded from the feature matrix in both splits.
non_feature_columns = ['Original_Value', 'Guest', "Reference", "CID_Guest",
                       "IsomericSMILES_Host", "IsomericSMILES", 'Charge_Host']

# The training frame additionally loses the target 'K' and 'logK'.
x_train = x_train.drop(columns=non_feature_columns + ['K', 'logK'])

# Derive the test features from the working frame x_test (not from the raw
# dataset_test) so the 'Erreur' imputation applied to x_test is used explicitly
# instead of surviving only because both names point at the same object.
x_test = x_test.drop(columns=non_feature_columns)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
categorical_columns = ["Host"]  # columns excluded from the passthrough name list
Onehot = ["Host"]               # columns one-hot encoded by the ColumnTransformer
label_encoder = []              # columns to label-encode; empty, so the loop below is a no-op
for col in label_encoder:
    x_train[col] = LabelEncoder().fit_transform(x_train[col])

# Names of the columns that pass through untouched, in their original order.
remainder_cols = [col for col in x_train.columns if col not in categorical_columns]

# One-hot encode the 'Onehot' columns; categories unseen at fit time are encoded
# as all zeros at transform time (handle_unknown="ignore").
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), Onehot),
    ],
    remainder='passthrough',
)
x_train_array = ct.fit_transform(x_train)

# Output columns are the encoder's columns first, then the passthrough columns.
encoded_feature_names = ct.named_transformers_['encoder'].get_feature_names_out(categorical_columns)
all_feature_names = list(encoded_feature_names) + remainder_cols

# Rebuild the frame with the ORIGINAL row index. Without index=..., the frame gets
# a fresh 0..n-1 index while y_train keeps the gapped index left by the outlier
# filter, so any index-aligned operation between the two would silently misalign.
x_train = pd.DataFrame(x_train_array, columns=all_feature_names, index=x_train.index)

In [10]:
# Label-encode BEFORE transforming, so the encoded values are what the fitted
# ColumnTransformer receives (the list is currently empty, so this is a no-op).
# NOTE(review): this fits a fresh LabelEncoder on the test data, which would give
# codes inconsistent with the training encoding — reuse the encoder fitted on the
# training data before adding any column to `label_encoder`.
for col in label_encoder:
    x_test[col] = LabelEncoder().fit_transform(x_test[col])

# Apply the transformer fitted on the training data; do not refit on the test set.
x_test_array = ct.transform(x_test)

# Same column layout as the training frame: encoder columns, then passthrough columns.
encoded_feature_names_test = ct.named_transformers_['encoder'].get_feature_names_out(categorical_columns)
all_feature_names_test = list(encoded_feature_names_test) + remainder_cols

# Use the names computed for the test split and keep the original row index.
x_test = pd.DataFrame(x_test_array, columns=all_feature_names_test, index=x_test.index)

In [11]:
# Hyper-parameter values obtained from the tuning done in Untitled11.
lr = 0.01      # learning_rate
Gamma = 0.01   # gamma
n_est = 400    # n_estimators
md = 11        # max_depth
sub = 0.6      # subsample
col = 0.8      # colsample_bytree
Rega = 0.1     # reg_alpha
Regl = 1.5     # reg_lambda

# Keyword arguments later unpacked into the XGBRegressor constructor.
best_params = dict(
    learning_rate=lr,
    gamma=Gamma,
    n_estimators=n_est,
    max_depth=md,
    reg_lambda=Regl,
    reg_alpha=Rega,
    subsample=sub,
    colsample_bytree=col,
    booster='gbtree',
    random_state=42,
)

In [12]:
from xgboost import XGBRegressor
# Build the (unfitted) regressor from the tuned keyword arguments.
XG_reg = XGBRegressor(**best_params)


In [13]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation, scored with negated mean absolute error
# (the scorer returns negative values, hence the sign flip when printing).
scores = cross_val_score(
    estimator=XG_reg,
    X=x_train.to_numpy(),
    y=y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print(f"MAE Scores: {-scores}")
print(f"Mean MAE Score: {-scores.mean()} (+/- {scores.std()})")

MAE Scores: [279.81172413 288.79032236 264.27838858 291.30112157 307.804741  ]
Mean MAE Score: 286.397259527006 (+/- 14.287537256432934)


In [14]:
# 5-fold cross-validation, scored with negated mean squared error.
scores = cross_val_score(
    estimator=XG_reg,
    X=x_train.to_numpy(),
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
# Flip the sign and take the square root to obtain one RMSE per fold.
scores = np.sqrt(-scores)
print(f"RMSE Scores: {scores}")
print(f"Mean RMSE Score: {scores.mean()} (+/- {scores.std()})")

RMSE Scores: [504.90584762 524.70186701 455.72058626 518.12556745 487.54005936]
Mean RMSE Score: 498.1987855408937 (+/- 24.74865856533938)


In [15]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation, scored with R^2.
scores = cross_val_score(
    estimator=XG_reg,
    X=x_train.to_numpy(),
    y=y_train,
    cv=5,
    scoring='r2',
)
print(f"CV Scores: {scores}")
# The spread printed here is two standard deviations of the fold scores.
print(f"Mean CV Score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

CV Scores: [0.56421113 0.54252005 0.6326493  0.55318791 0.58223283]
Mean CV Score: 0.575 (+/- 0.063)


In [16]:
# Fit the regressor on the full training matrix (values only, no column labels).
XG_reg.fit(x_train.to_numpy(), y_train)

In [17]:
# Predict on the encoded test matrix (values only, no column labels).
y_pred = XG_reg.predict(x_test.to_numpy())

In [18]:
# Natural logarithm of the predictions, displayed as the cell output.
# NOTE(review): np.log yields nan for negative inputs; presumably the predictions
# are expected to be strictly positive — confirm.
np.log(y_pred)

array([5.743252 , 4.11867  , 5.9752216, 4.8832183, 6.06333  , 3.8699543,
       6.3788047, 7.728261 , 5.4677286, 5.733649 , 6.303048 , 5.9243436,
       6.327798 , 6.513356 , 5.617495 , 5.190373 , 6.49611  , 6.109639 ,
       5.1338577, 5.01518  , 5.557375 , 6.368347 , 5.3302975, 4.5197883,
       6.201219 , 4.426853 , 6.3862705, 5.1872716, 6.9740043, 6.3928123,
       7.0301666, 4.458805 , 6.8325796, 5.7411804, 6.964149 , 3.6692832,
       7.315796 , 7.848902 , 7.255931 , 6.651521 , 6.8357215, 6.0788116,
       5.6454096, 4.3109555, 5.6186438, 6.3318257, 6.2573285, 4.9287987,
       5.314442 , 4.5327535, 6.3389797, 6.3294973, 6.6816726, 6.1951694,
       5.8479385, 5.5331483, 5.929769 , 4.742644 , 5.6189156, 6.4669952,
       5.1287594, 6.378237 , 6.1191263, 6.0427904, 5.938618 , 5.096092 ,
       7.1632366, 6.119626 , 4.11867  , 4.5661297, 7.9203825, 5.7421713,
       6.3582196, 6.3691216, 6.419768 , 7.0932946, 5.9940395, 5.6059837,
       5.7839217, 6.734102 , 5.3698616, 5.7747307, 