In [4]:
!pip install pandas numpy scikit-learn shap

Defaulting to user installation because normal site-packages is not writeable
Collecting shap
  Downloading shap-0.50.0-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting numpy
  Downloading numpy-2.4.0-cp312-cp312-win_amd64.whl.metadata (6.6 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numpy
  Using cached numpy-2.0.2-cp312-cp312-win_amd64.whl.metadata (59 kB)
Downloading shap-0.50.0-cp312-cp312-win_amd64.whl (549 kB)
   ---------------------------------------- 0.0/549.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/549.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/549.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/549.3 kB ? eta -:--:--
   ------------------- -------------------- 262.1/549.3 kB ? eta -:--:--
   ------------------- -------------------- 262.1/549.3 kB ? eta -:--:--
   ------------------- -------------------- 262.1/549.3 kB ? eta -:--:--
  

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.18.0 requires ml-dtypes<0.5.0,>=0.4.0, but you have ml-dtypes 0.5.3 which is incompatible.
tensorflow-intel 2.18.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.33.0 which is incompatible.
tensorflow-intel 2.18.0 requires tensorboard<2.19,>=2.18, but you have tensorboard 2.20.0 which is incompatible.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.0.2 which is incompatible.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.0.2 which is incompatible.
streamlit 1.37.1 requires protobuf<6,>=3.20, but you have protobuf 6.33.0 which is incompatible.


In [None]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

print("scikit-learn version:", sklearn.__version__)

try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train = train[train["GrLivArea"] < 4500]

X = train.drop("SalePrice", axis=1)
y = np.log1p(train["SalePrice"])
test_ids = test["Id"]

num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", ohe)
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

models = {
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(max_iter=5000),
    "GBR": GradientBoostingRegressor(random_state=42)
}

params = {
    "Ridge": {"model__alpha": [5, 10, 20]},
    "ElasticNet": {
        "model__alpha": [0.0005, 0.001],
        "model__l1_ratio": [0.3, 0.5]
    },
    "GBR": {
        "model__n_estimators": [300],
        "model__learning_rate": [0.05],
        "model__max_depth": [3]
    }
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_rmse = float("inf")
best_model = None

for name, model in models.items():
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    grid = GridSearchCV(
        pipe,
        params[name],
        cv=kf,
        scoring="neg_root_mean_squared_error",
        n_jobs=1  # üî• FIXES BrokenProcessPool
    )
    
    grid.fit(X, y)
    rmse = -grid.best_score_
    
    print(f"{name} RMSE: {rmse:.4f}")
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = grid.best_estimator_

print("\nüèÜ Best model selected")

best_model.fit(X, y)

test_preds = np.expm1(best_model.predict(test))

submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": test_preds
})

submission.to_csv("sub.csv", index=False)
print("‚úÖ submission.csv created successfully")

scikit-learn version: 1.5.1
Ridge RMSE: 0.1145
ElasticNet RMSE: 0.1130
GBR RMSE: 0.1216

üèÜ Best model selected
‚úÖ submission.csv created successfully
