In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Load the training table; the 'id' column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer="../input/train.csv",
    index_col='id',
)

def calculate_descriptors(smiles):
    """Compute seven RDKit descriptors for one SMILES string.

    Args:
        smiles: SMILES text for a molecule. Any non-string value (for
            example the float NaN pandas uses for an empty CSV cell) is
            treated as missing.

    Returns:
        A list of seven values in this order: MolWt, TPSA, NumHDonors,
        NumHAcceptors, RingCount, MolLogP, NumRotatableBonds. Every entry
        is NaN when the input is not a string or when
        ``Chem.MolFromSmiles`` returns ``None`` for it.
    """
    n_descriptors = 7

    # Guard before touching RDKit: MolFromSmiles expects a string, so a
    # missing cell would otherwise raise instead of yielding a NaN row.
    if not isinstance(smiles, str):
        return [np.nan] * n_descriptors

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [np.nan] * n_descriptors

    return [
        Descriptors.MolWt(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.RingCount(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumRotatableBonds(mol),
    ]

# --- Feature construction -------------------------------------------------
# One list of RDKit descriptors per row of df (NaNs when unavailable).
smiles_descriptors = df['SMILES'].apply(calculate_descriptors)

# Build the descriptor frame on df's own index. df is indexed by 'id', and
# pd.concat(axis=1) aligns on index labels, so a default RangeIndex here
# would pair descriptors with the wrong molecules or add all-NaN rows.
smiles_df = pd.DataFrame(
    smiles_descriptors.tolist(),
    columns=['MolWt', 'TPSA', 'NumHDonors', 'NumHAcceptors', 'RingCount',
             'MolLogP', 'NumRotatableBonds'],
    index=df.index,
)

# Every numeric column of df except the target 'Tm'.
numerical_columns = df.select_dtypes(include=[np.number]).drop(columns=['Tm']).columns
X_other = df[numerical_columns]

X_combined = pd.concat([X_other, smiles_df], axis=1)

y = df['Tm']

# --- Row filtering --------------------------------------------------------
# Keep rows with no missing feature and no missing target. The mask is a
# plain boolean array, so the same rows are taken from X and y by position.
complete_rows = (X_combined.notna().all(axis=1) & y.notna()).to_numpy()

X_combined_clean = X_combined.loc[complete_rows].reset_index(drop=True)
y_clean = y.loc[complete_rows].reset_index(drop=True)

print(f"Shape of X_combined_clean: {X_combined_clean.shape}")
print(f"Shape of y_clean: {y_clean.shape}")

# --- Split, then select features -----------------------------------------
# Split BEFORE fitting the selector: scoring features against the held-out
# targets would leak test information and make the reported MAE optimistic.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X_combined_clean, y_clean, test_size=0.2, random_state=42)

# Clamp k so the selector does not fail when fewer than 100 features exist.
selector = SelectKBest(score_func=f_regression,
                       k=min(100, X_combined_clean.shape[1]))
X_train = selector.fit_transform(X_train_all, y_train)
X_test = selector.transform(X_test_all)

# Selected view of the full cleaned matrix, kept for any later cell that
# reads X_selected; the selector itself was fitted on training rows only.
X_selected = selector.transform(X_combined_clean)

# --- Model ----------------------------------------------------------------
xgb_model = XGBRegressor(n_estimators=1000, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)


Shape of X_combined_clean: (2144, 431)
Shape of y_clean: (2144,)
Mean Absolute Error: 40.389467155972284
