In [1]:
import openml
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load the dataset from OpenML by its numeric ID.
dataset_id = 43546
dataset = openml.datasets.get_dataset(dataset_id)
result = dataset.get_data(dataset_format='dataframe')

# get_data() may hand back a tuple whose first element is the feature frame;
# accept either a tuple or a bare object.
if isinstance(result, tuple):
    data = result[0]
else:
    data = result

# Quick sanity check of what was loaded.
print(
    "Data loaded successfully!",
    f"Data shape: {data.shape}",
    f"Columns: {data.columns}",
    sep="\n",
)

Data loaded successfully!
Data shape: (9982, 25)
Columns: Index(['Name', 'InChI', 'InChIKey', 'SMILES', 'Solubility', 'SD', 'Ocurrences',
       'Group', 'MolWt', 'MolLogP', 'MolMR', 'HeavyAtomCount', 'NumHAcceptors',
       'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
       'NumValenceElectrons', 'NumAromaticRings', 'NumSaturatedRings',
       'NumAliphaticRings', 'RingCount', 'TPSA', 'LabuteASA', 'BalabanJ',
       'BertzCT'],
      dtype='object')


In [2]:
# 2. Separate the regression target from the feature columns.
target_column = 'Solubility'
X = data.drop(columns=[target_column])  # features
y = data[target_column]  # target

# 3. Drop the free-text identifier columns.
# NOTE(review): 'SD' and 'Ocurrences' remain as features; they look like
# metadata about the target measurement -- confirm they are legitimate
# predictors and not a source of target leakage.
columns_to_exclude = ['Name', 'InChI', 'InChIKey', 'SMILES']
X = X.drop(columns=columns_to_exclude)

# 4. One-hot encode the remaining categorical columns, but first discard any
#    whose cardinality would blow up the feature space. Both object and
#    category dtypes are inspected because pd.get_dummies encodes both.
max_unique_categories = 100
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
high_cardinality_columns = [
    col for col in categorical_columns
    if X[col].nunique() > max_unique_categories
]
for col in high_cardinality_columns:
    print(f"Excluding column from encoding: {col}")
X = X.drop(columns=high_cardinality_columns)
X = pd.get_dummies(X, drop_first=True)

# 5. Split BEFORE scaling so that no test-set statistics reach the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Fit the scaler on the training split only, then apply the same
#    transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with code that expects the full scaled
# matrix; it uses the training-set statistics.
X_scaled = scaler.transform(X)

# Report the resulting shapes.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (7985, 23)
X_test shape: (1997, 23)
y_train shape: (7985,)
y_test shape: (1997,)


In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression: fit() returns the estimator itself, so construction
# and training are chained into one statement.
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line.
print(
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)

Mean Squared Error (MSE): 2.7068
R2 Score: 0.5010


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest regressor with 100 trees and a fixed seed; fit() returns the
# estimator, so it is built and trained in a single expression.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line.
print(
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)


Mean Squared Error (MSE): 1.1960
R2 Score: 0.7795


In [5]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Build the XGBoost regressor (100 boosting rounds, fixed seed) and train it.
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the held-out split, then compute both metrics.
y_pred_xgb = xgb_model.predict(X_test)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

# Report both metrics, one per line.
print(
    f"XGBoost MSE: {mse_xgb:.4f}",
    f"XGBoost R2: {r2_xgb:.4f}",
    sep="\n",
)

XGBoost MSE: 1.2386
XGBoost R2: 0.7717
