# Deepseek project

### 1. prepare the data

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Load the dataset
data = pd.read_csv("sample_solubility.csv")

# Function to convert SMILES to molecules and calculate descriptors
def get_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # Calculate descriptors (returns a dictionary)
        descriptors = Descriptors.CalcMolDescriptors(mol)
        return descriptors
    else:
        return None  # Skip invalid SMILES

# Apply the function to create a descriptor DataFrame
descriptor_list = []
for idx, row in data.iterrows():
    desc = get_descriptors(row["SMILES"])
    if desc is not None:
        desc["SMILES"] = row["SMILES"]  # Track valid SMILES
        desc["Solubility"] = row["Solubility"]
        descriptor_list.append(desc)

# Create final DataFrame
df = pd.DataFrame(descriptor_list).dropna()
print(f"Valid molecules: {len(df)}")

Valid molecules: 29


[13:19:36] SMILES Parse Error: unclosed ring for input: 'C1=CC=C(C=C1)NO2'


What Happens:
Invalid SMILES (e.g., NaCl) are skipped.
Descriptors like MolLogP, MolWt, and NumHAcceptors are calculated.

### 2. Preprocess the Data

In [None]:
# Split into features (X) and target (y)
X = df.drop(["SMILES", "Solubility"], axis=1)
y = df["Solubility"]

# Handle missing values (if any)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)

# Split into train/test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load data
df = pd.read_csv("sample_solubility_2.csv")
df["Solubility"] = pd.to_numeric(df["Solubility"], errors="coerce")
df = df.dropna(subset=["Solubility"])

# Calculate descriptors
def get_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return Descriptors.CalcMolDescriptors(mol)
    return None

descriptor_list = []
for idx, row in df.iterrows():
    desc = get_descriptors(row["SMILES"])
    if desc is not None:
        desc["Solubility"] = row["Solubility"]
        descriptor_list.append(desc)

df_clean = pd.DataFrame(descriptor_list).dropna()

# Split data (X and y are DataFrames/Series)
X = df_clean.drop("Solubility", axis=1)
y = df_clean["Solubility"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to NumPy arrays AFTER splitting
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values.ravel()  # Flatten to 1D
y_test = y_test.values.ravel()

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("Train RMSE:", mean_squared_error(y_train, y_pred_train, squared=False))
print("Test RMSE:", mean_squared_error(y_test, y_pred_test, squared=False))
print("Test R²:", r2_score(y_test, y_pred_test))

TypeError: got an unexpected keyword argument 'squared'