In [2]:
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from loguru import logger

class MolEmbeddingsExtractor:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        
    def tokenize_smiles(self, smiles):
        tokens = self.tokenizer.encode(smiles, add_special_tokens=True)
        input_ids = torch.tensor(tokens).unsqueeze(0)  # добавляем размерность пакета
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
        return input_ids, attention_mask
    
    def get_smiles_embeddings(self, smiles_list):
        mol_embeddings_list = []
        
        for smiles in smiles_list:
            input_ids, attention_mask = self.tokenize_smiles(smiles)
            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)
            mol_embeddings = outputs[0].mean(dim=1)
            mol_embeddings_list.append(mol_embeddings)
        
        mol_embeddings = torch.cat(mol_embeddings_list).mean(dim=0)
        return mol_embeddings.numpy()
    
    def process_dataframe(self, input_df):
        frame = pd.DataFrame(columns=['blend_id','smiles','oil_property_param_value','mixed_smiles'])
        blend_ids = input_df['blend_id'].unique()
        
        
        for blend_id in blend_ids:
            pivot_table = input_df[input_df['blend_id'] == blend_id]
            smiles_list = pivot_table['smiles'].values.tolist()
            smiles_embs = self.get_smiles_embeddings(smiles_list)
            pivot_table["mixed_smiles"] = [smiles_embs] * len(pivot_table)  # Repeat the numpy array to match the DataFrame length
            frame = pd.concat([frame, pivot_table], axis=0, ignore_index=True)
        
        return frame

if __name__ == "__main__":
    model_name = "seyonec/ChemBERTa-zinc-base-v1"
    extractor = MolEmbeddingsExtractor(model_name)
    
    df_smiles_train = pd.read_csv(r"..\data\smiles_train_set.csv")
    df_smiles_test=pd.read_csv(r"..\data\smiles_test_set_public.csv")
    train_set = extractor.process_dataframe(df_smiles_train)
    test_set = extractor.process_dataframe(df_smiles_test)
    

  from .autonotebook import tqdm as notebook_tqdm
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_table["mixed_smiles"] = [smiles_embs] * len(pivot_table)  # Repeat the numpy array to match the DataFrame length
  frame = pd.concat([frame, pivot_table], axis=0, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_table["mixed_smiles"] = [smiles_embs] * len(pivot_table)  # Repeat the numpy array to match the DataFrame length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

In [3]:
train_set=train_set.dropna()

In [4]:
train_set

Unnamed: 0,blend_id,smiles,oil_property_param_value,mixed_smiles
0,49743a76-a614-11ee-9529-005056921581,CCCCC,103300.0,"[1.3650681, 1.4397022, 0.3862609, -1.692783, 0..."
1,49743a76-a614-11ee-9529-005056921581,CCCC(C)CCC,103300.0,"[1.3650681, 1.4397022, 0.3862609, -1.692783, 0..."
2,49743a76-a614-11ee-9529-005056921581,CCC(C(OC)=O)CC,103300.0,"[1.3650681, 1.4397022, 0.3862609, -1.692783, 0..."
3,49743a76-a614-11ee-9529-005056921581,CCCCC(C)C,103300.0,"[1.3650681, 1.4397022, 0.3862609, -1.692783, 0..."
4,49743a76-a614-11ee-9529-005056921581,CC(C)(C)CC(C)(C)C,103300.0,"[1.3650681, 1.4397022, 0.3862609, -1.692783, 0..."
...,...,...,...,...
1382,6babd070-4bf3-11ee-9c35-005056921581,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC,189150.0,"[1.1391077, 0.48837262, 0.16649444, -0.7171905..."
1383,45f1e44a-9410-11ee-8abf-005056921581,CCCC(C)CCC,12510.0,"[1.358669, 1.4944685, 0.31815097, -1.7526857, ..."
1384,45f1e44a-9410-11ee-8abf-005056921581,CCC(C(OC)=O)CC,12510.0,"[1.358669, 1.4944685, 0.31815097, -1.7526857, ..."
1385,45f1e44a-9410-11ee-8abf-005056921581,CCCCCCC,12510.0,"[1.358669, 1.4944685, 0.31815097, -1.7526857, ..."


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train = train_set['mixed_smiles'].values.tolist(),test_set['mixed_smiles'].values.tolist(),train_set['oil_property_param_value'].values

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet, LassoLars, SGDRegressor, RANSACRegressor, Ridge
from sklearn.ensemble import RandomForestRegressor

# Определение конвейера с регрессорами
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Предварительная обработка данных
    ('regressor', DecisionTreeRegressor())  # Регрессор по умолчанию (можно изменить)
])

# Словарь параметров для поиска по сетке
param_grid = [
    {
        'regressor': [DecisionTreeRegressor()],
        'regressor__max_depth': [3, 5, 7, None],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4]
    },
    {
        'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [3, 5, 7, None],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4]
    },
    {
        'regressor': [GradientBoostingRegressor()],
        'regressor__n_estimators': [50, 100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.5],
        'regressor__max_depth': [3, 5, 7],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4]
    },
    {
        'regressor': [SVR()],
        'regressor__kernel': ['linear', 'rbf'],
        'regressor__C': [0.1, 1, 10],
        'regressor__gamma': ['scale', 'auto']
    }
]

# Создание объекта GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)


grid_search.fit(X_train, y_train)
print("Best parameters set found on development set:")
print(grid_search.best_params_)
print()
print("Grid scores on development set:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
# model = ElasticNet(alpha=0.1)
# model.fit(X_train, y_train)

Fitting 5 folds for each of 82 candidates, totalling 410 fits
Best parameters set found on development set:
{'regressor': RandomForestRegressor(), 'regressor__max_depth': None, 'regressor__n_estimators': 50}

Grid scores on development set:
0.176 (+/-0.222) for {'regressor': ElasticNet(), 'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.1}
0.197 (+/-0.243) for {'regressor': ElasticNet(), 'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.5}
0.232 (+/-0.280) for {'regressor': ElasticNet(), 'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.9}
0.122 (+/-0.156) for {'regressor': ElasticNet(), 'regressor__alpha': 0.5, 'regressor__l1_ratio': 0.1}
0.141 (+/-0.181) for {'regressor': ElasticNet(), 'regressor__alpha': 0.5, 'regressor__l1_ratio': 0.5}
0.197 (+/-0.243) for {'regressor': ElasticNet(), 'regressor__alpha': 0.5, 'regressor__l1_ratio': 0.9}
0.100 (+/-0.127) for {'regressor': ElasticNet(), 'regressor__alpha': 1.0, 'regressor__l1_ratio': 0.1}
0.119 (+/-0.152) for {'regressor': ElasticNet

In [7]:
grid_search.predict(X_test)

array([ 19936.52031524,  19936.52031524,  19936.52031524,  19936.52031524,
        10739.4490303 ,  10739.4490303 ,  32664.74885401,  32664.74885401,
        32664.74885401,  32664.74885401,  19114.84486454,  19114.84486454,
        19114.84486454,  19114.84486454,  10739.4490303 ,  10739.4490303 ,
       121961.6575455 , 121961.6575455 , 121961.6575455 , 121961.6575455 ,
       121961.6575455 , 104061.73233247, 104061.73233247, 104061.73233247,
       104061.73233247, 104061.73233247, 169685.56421356, 169685.56421356,
       169685.56421356,  58001.4576986 ,  58001.4576986 ,  58001.4576986 ,
       116315.19650397, 116315.19650397, 116315.19650397, 116315.19650397,
       116315.19650397, 116315.19650397,  15570.        ,  15570.        ,
        15570.        ,  15570.        ,  15570.        ,  19004.24662338,
        19004.24662338,  19004.24662338,  34003.77431162,  34003.77431162,
        34003.77431162,  34003.77431162,  34003.77431162,  14848.33004553,
        14848.33004553,  

In [8]:
df_res=pd.read_csv(r"C:\Users\Jora\Desktop\neft\data\smiles_test_set_public.csv")
df_res['result']=grid_search.predict(X_test)
df_res = df_res.drop('smiles', axis=1)
df_res=df_res.drop_duplicates(subset=['blend_id'])
df_res.to_csv("test_lr.csv",index=False)