In [2]:
!pip install transformers datasets rdkit pandas scikit-learn numpy deepchem

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2024.9.6-cp311-cp3

In [3]:
import deepchem as dc
import pandas as pd

tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='ECFP', split='random')
train_dataset, test_dataset, valid_dataset = datasets
train_dataset

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


<DiskDataset X.shape: (np.int64(902), np.int64(1024)), y.shape: (np.int64(902), np.int64(1)), w.shape: (np.int64(902), np.int64(1)), ids: ['CCCCCCCCO' 'Cn1cnc2n(C)c(=O)n(C)c(=O)c12' 'CC#C' ... 'CCC#C'
 'CCCCC(CC)C=O' 'Cc1ccc2cc3ccccc3cc2c1'], task_names: ['measured log solubility in mols per litre']>

In [4]:
from sklearn.ensemble import RandomForestRegressor
from deepchem.models import SklearnModel

rf_model = SklearnModel(model=RandomForestRegressor(n_estimators=500, random_state=0))
rf_model.fit(train_dataset)

In [5]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_true = test_dataset.y
y_pred = rf_model.predict(test_dataset)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2_score=r2_score(y_true, y_pred)

results = []
results.append({
    'Модель': 'RandomForest',
    'RMSE': rmse,
    'R2': r2_score
})

print(f'RMSE: {rmse}')
print(f'R2: {r2_score}')

RMSE: 0.5857279585195224
R2: 0.6189610101144213


In [6]:
import joblib

joblib.dump(rf_model, 'rf_model.pkt')

['rf_model.pkt']

In [7]:
# Загрузка датасета ESOL
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='Raw', split=None)
dataset = datasets[0]



In [8]:
from rdkit import Chem
print(Chem.MolToSmiles(dataset.X[0]))

N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)c1ccccc1


In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = RobertaForSequenceClassification.from_pretrained("seyonec/ChemBERTa-zinc-base-v1", num_labels=1, problem_type='regression')

def tokenize(smiles_list):
  return tokenizer(smiles_list, padding=True, truncation=True, return_tensors='pt', max_length=128)

smiles = list(dataset.ids)
logS = dataset.y
tokenized = tokenize(smiles)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import torch

# Прогнозирование
with torch.no_grad():
    model.eval()
    inputs = tokenized
    outputs = model(**inputs)
    predictions = outputs.logits.squeeze().cpu().numpy()

model.safetensors:   0%|          | 0.00/179M [00:00<?, ?B/s]

In [11]:
import sklearn.metrics
# Реальные значения (logS)
labels = logS.flatten()

# Вычисление RMSE
rmse = np.sqrt(mean_squared_error(labels, predictions))
r2_score = sklearn.metrics.r2_score(labels, predictions)
results.append({
    'Модель': 'ChemBERTa (without pre-training)',
    'RMSE': rmse,
    'R2': r2_score
})

print(f"Test RMSE: {rmse}")
print(f'Test R2:{r2_score}')

Test RMSE: 1.0985271884482741
Test R2:-0.20676198376007004


In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import pandas as pd

general_path = '/content/drive/MyDrive/Snezhko-course-project-2024-25/2025/models/'
# Пути к файлам
gcn_metrics_path = general_path + 'GCN/gcn_model_evaluations.csv'  # Файл с метриками GCN
global_metrics_path = general_path + '/model_evaluations.csv'  # Общий файл статистики

# Загружаем метрики GCN
gcn_df = pd.read_csv(gcn_metrics_path)

# Получаем последнюю строку (метрики последней модели GCN)
last_gcn_metrics = gcn_df.iloc[[-1]]

# Извлекаем нужные значения из последней строки
gcn_model_name = last_gcn_metrics["Model Name"].values[0]
gcn_rmse = last_gcn_metrics["RMSE"].values[0]
gcn_r2 = last_gcn_metrics["R²"].values[0]

results.append({
    'Модель': gcn_model_name,
    'RMSE': gcn_rmse,
    'R2': gcn_r2
})

In [18]:
result_df = pd.DataFrame(results, index=None)

display(result_df)

Unnamed: 0,Модель,RMSE,R2
0,RandomForest,0.585728,0.618961
1,ChemBERTa (without pre-training),1.098527,-0.206762
2,GCN_Tox21,1.036997,0.772497


In [21]:
# Загружаем общий файл статистики, если он существует
try:
    global_df = pd.read_csv(global_metrics_path)
except FileNotFoundError:
    # Если файл не существует, создаем новый DataFrame
    global_df = pd.DataFrame(columns=["Модель", "RMSE", "R2"])

# Преобразуем результаты в DataFrame и добавляем их в общий файл
results_df = pd.DataFrame(results)
global_df = pd.concat([global_df, results_df], ignore_index=True)

# Сохраняем обновленный DataFrame обратно в CSV файл
global_df.to_csv(global_metrics_path, index=False)

print(f"Последние метрики GCN добавлены в общий файл статистики: {global_metrics_path}")


Последние метрики GCN добавлены в общий файл статистики: /content/drive/MyDrive/Snezhko-course-project-2024-25/2025/models//model_evaluations.csv


  global_df = pd.concat([global_df, results_df], ignore_index=True)
