In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


In [5]:
# Read the training table from disk; the 'id' column becomes the row index.
df = pd.read_csv(
    "../input/train.csv",
    index_col='id',
)

def calculate_descriptors(smiles):
    """Compute seven RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        ``[MolWt, TPSA, NumHDonors, NumHAcceptors, RingCount, MolLogP,
        NumRotatableBonds]`` in that order. If ``smiles`` is not a string or
        cannot be parsed into a molecule, a list of seven ``np.nan`` values is
        returned so every row has the same width.
    """
    # Must equal the number of descriptors returned below; a mismatch makes
    # the rows ragged and breaks DataFrame construction downstream.
    n_descriptors = 7

    # Missing cells typically arrive from pandas as float NaN rather than str.
    # Treat them like an unparsable molecule instead of handing them to RDKit.
    if not isinstance(smiles, str):
        return [np.nan] * n_descriptors

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # The SMILES could not be turned into a molecule.
        return [np.nan] * n_descriptors

    return [
        Descriptors.MolWt(mol),
        Descriptors.TPSA(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.RingCount(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumRotatableBonds(mol),
    ]

# Compute one descriptor row per molecule.
descriptor_rows = df['SMILES'].apply(calculate_descriptors)

# Convert the features into a DataFrame. Reuse df's index so the feature rows
# stay label-aligned with the target instead of getting a fresh 0..n-1 index.
X = pd.DataFrame(
    descriptor_rows.tolist(),
    columns=[
        'MolWt', 'TPSA', 'NumHDonors', 'NumHAcceptors', 'RingCount', 'MolLogP', 'NumRotatableBonds'],
    index=df.index,
)

# Target variable
y = df['Tm']

# Decide the train/test partition first, on row positions, so that every
# fitted preprocessing step below only ever sees the training rows.
row_positions = np.arange(len(X))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Handle missing values by filling with the column median of the TRAINING rows
# (using medians of the full table would leak held-out data into the features).
train_medians = X.iloc[train_pos].median()
X = X.fillna(train_medians)

# Keep at most MAX_FEATURES features ranked by f_regression. k is capped at the
# number of available columns, because asking for more features than exist is
# rejected (or warned about) by SelectKBest.
MAX_FEATURES = 100
selector = SelectKBest(score_func=f_regression, k=min(MAX_FEATURES, X.shape[1]))
selector.fit(X.iloc[train_pos], y.iloc[train_pos])  # scores from training rows only
X_selected = selector.transform(X)

# Materialise the training and testing sets from the partition chosen above
X_train, X_test = X_selected[train_pos], X_selected[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# build and evaluate xgb
xgb_model = XGBRegressor(n_estimators=1000, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(mae)




41.7154707591216
