<a href="https://colab.research.google.com/github/JJJuniorDev/ML-colab/blob/main/clf_basic_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tarfile
import urllib.request
import pandas as pd

# Base URL of the repository that hosts the dataset archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Full URL of the compressed housing dataset.
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
# Local directory (relative to the working directory) where the data is stored.
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded archive
        (``housing.tgz``) and its extracted contents. It is created if it
        does not exist yet.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` or ``tarfile`` raise on a failed
    download or a broken archive; nothing is swallowed here.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: use the "data" filter so a
            # downloaded archive cannot write outside `housing_path`, and so
            # the DeprecationWarning about the missing filter goes away.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters have no `filter` argument; members are
            # extracted unfiltered, so only use trusted archives here.
            housing_tgz.extractall(path=housing_path)

# Download and unpack the archive, then load the extracted CSV.
fetch_housing_data()

housing_csv_path = os.path.join(HOUSING_PATH, "housing.csv")
housing = pd.read_csv(housing_csv_path)

  housing_tgz.extractall(path=housing_path)


In [2]:
from sklearn.model_selection import train_test_split

# Features: every column except the target and the categorical
# "ocean_proximity" column.
X = housing.drop(columns=["median_house_value", "ocean_proximity"])
y = housing["median_house_value"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep ONLY the first N_TRAIN_SAMPLES rows of the training set.
# NOTE(review): the "_5k" suffix is historical -- the subset holds 3000 rows,
# not 5000. The names are kept because later cells reference them; confirm
# which size was actually intended.
N_TRAIN_SAMPLES = 3000
X_train_5k = X_train.iloc[:N_TRAIN_SAMPLES]
y_train_5k = y_train.iloc[:N_TRAIN_SAMPLES]

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, TransformerMixin

class MyTransformer(BaseEstimator, TransformerMixin):
    """Replace the input with one KNN-predicted feature built from coordinates.

    A ``KNeighborsRegressor`` is fitted on the two coordinate columns only.
    ``transform`` returns a single column holding that regressor's prediction
    for every row; all other input columns are discarded.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbours used by the inner ``KNeighborsRegressor``.
    lat_col : default="latitude"
        Identifier of the latitude column.
    lon_col : default="longitude"
        Identifier of the longitude column.

    Notes
    -----
    ``lat_col`` / ``lon_col`` are used as column labels when ``X`` exposes
    ``.loc`` (e.g. a DataFrame) and as positional indices otherwise, so they
    must be integers when ``X`` is a plain array.

    NOTE(review): ``transform`` applied to the rows used in ``fit`` predicts
    with a model that has already seen those rows' targets -- confirm this is
    acceptable for the downstream estimator.
    """

    def __init__(self, n_neighbors=5, lat_col="latitude",
                 lon_col="longitude"):
        self.n_neighbors = n_neighbors
        self.lat_col = lat_col
        self.lon_col = lon_col
        # Kept so the attribute exists before fitting; `fit` rebuilds it from
        # the current `n_neighbors` value.
        self.model = KNeighborsRegressor(n_neighbors=n_neighbors)

    def _geo(self, X):
        """Return the two coordinate columns of ``X``."""
        # X is a DataFrame-like object: select by column label.
        if hasattr(X, "loc"):
            return X[[self.lat_col, self.lon_col]].values
        # X is an array: select by position (lat_col / lon_col must be ints).
        return X[:, [self.lat_col, self.lon_col]]

    def fit(self, X, y):
        """Fit the inner KNN regressor on the coordinate columns.

        Returns
        -------
        self
        """
        # Re-create the regressor here instead of reusing the one built in
        # __init__: `set_params(n_neighbors=...)` (which is how GridSearchCV
        # applies candidate values after cloning) only updates the attribute,
        # so a model built at construction time would keep the old value.
        self.model = KNeighborsRegressor(n_neighbors=self.n_neighbors)
        X_geo = self._geo(X)
        self.model.fit(X_geo, y)
        return self

    def transform(self, X):
        """Return the KNN predictions for ``X`` as a single column."""
        X_geo = self._geo(X)
        preds = self.model.predict(X_geo)
        # Downstream steps expect a 2-D feature matrix: one row per sample,
        # one column.
        return preds.reshape(-1, 1)


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import SelectFromModel

# Steps run in the listed order.
pipeline_steps = [
    # Runs FIRST: builds the KNN-based feature from the raw input.
    ("knn_features", MyTransformer(n_neighbors=5)),
    ("scaler", StandardScaler()),
    ("feature_selection", SelectFromModel(LinearSVR(random_state=42))),
    ("svr", SVR()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Candidate values of the SVR regularisation parameter, shared by both kernels.
svr_c_candidates = (1, 10, 100, 1000)

# Two sub-grids: a linear kernel (C only) and an RBF kernel (C and gamma).
param_grid = [
    dict(svr__kernel=["linear"], svr__C=list(svr_c_candidates)),
    dict(
        svr__kernel=["rbf"],
        svr__C=list(svr_c_candidates),
        svr__gamma=[0.01, 0.1, 1],
    ),
]

# Exhaustive search over `param_grid`, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,        # 3-fold cross-validation
    n_jobs=-1,   # use all available cores
)

# Fit on the reduced training subset.
grid_search.fit(X_train_5k, y_train_5k)

In [8]:
# Tabulate the cross-validation results and show the best-ranked candidates.
results = pd.DataFrame(grid_search.cv_results_)
summary_columns = ["params", "mean_test_score", "rank_test_score"]
results_summary = results[summary_columns]
results_summary.sort_values("rank_test_score").head()


Unnamed: 0,params,mean_test_score,rank_test_score
3,"{'svr__C': 1000, 'svr__kernel': 'linear'}",-4401075000.0,1
14,"{'svr__C': 1000, 'svr__gamma': 0.1, 'svr__kern...",-4856778000.0,2
15,"{'svr__C': 1000, 'svr__gamma': 1, 'svr__kernel...",-4886532000.0,3
2,"{'svr__C': 100, 'svr__kernel': 'linear'}",-4921790000.0,4
12,"{'svr__C': 100, 'svr__gamma': 1, 'svr__kernel'...",-9628081000.0,5
