In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

In [24]:
df = pd.read_csv("housing-train-data-6628a4723213d886993351.csv")

numerical_cols = ['longitude', 'latitude', 'housing_median_age',
       'total_rooms', 'total_bedrooms', 'population', 'households',
       'median_income']
categorical_cols = ['ocean_proximity']

X = df.drop(columns=["median_house_value", 'Unnamed: 0'])
y = df['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())])

categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipe, numerical_cols),
        ('cat', categorical_pipe, categorical_cols)])

model = KNeighborsRegressor()

pipeline = Pipeline([('preprocessor', preprocessor),
                     ('model', model)])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

param_grid = {
    'model__n_neighbors': [5, 7, 9, 12, 15, 20],
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan']
}
grid_pipeline = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_pipeline.fit(X_train, y_train)

y_pred = grid_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("RMSE", np.sqrt(mse))
print("R^2", r2)
print("Meilleurs paramètres:", grid_pipeline.best_params_)

RMSE 58146.19510340582
R^2 0.7439318885987141
Meilleurs paramètres: {'model__metric': 'manhattan', 'model__n_neighbors': 9, 'model__weights': 'distance'}


In [14]:
import joblib

joblib.dump(grid_pipeline, 'modele_knn.joblib')

loaded_model = joblib.load('modele_knn.joblib')

y_pred_loaded_model = loaded_model.predict(X_test)

mse_loaded_model = mean_squared_error(y_test, y_pred_loaded_model)
r2_loaded_model = r2_score(y_test, y_pred_loaded_model)

print("Mean Squared Error (Modèle chargé):", np.sqrt(mse_loaded_model))
print("R^2 Score (Modèle chargé):", r2_loaded_model)
print("Meilleurs paramètres (Modèle chargé):", loaded_model.best_params_)

Mean Squared Error (Modèle chargé): 59668.06635488564
R^2 Score (Modèle chargé): 0.7303522355435221
Meilleurs paramètres (Modèle chargé): {'model__algorithm': 'auto', 'model__n_neighbors': 10, 'model__weights': 'distance', 'preprocessor__num__imputer__strategy': 'median'}
